Skip to content

read_html(): rewinding [wip] #18017

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Nov 1, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.22.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ Indexing
I/O
^^^

-
- :func:`read_html` now rewinds seekable IO objects after parse failure, before attempting to parse with a new parser. If a parser errors and the object is non-seekable, an informative error is raised suggesting the use of a different parser (:issue:`17975`)
-
-

Expand Down
12 changes: 12 additions & 0 deletions pandas/io/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -742,6 +742,18 @@ def _parse(flavor, io, match, attrs, encoding, **kwargs):
try:
tables = p.parse_tables()
except Exception as caught:
# if `io` is an io-like object, check if it's seekable
# and try to rewind it before trying the next parser
if hasattr(io, 'seekable') and io.seekable():
io.seek(0)
elif hasattr(io, 'seekable') and not io.seekable():
# if we couldn't rewind it, let the user know
raise ValueError('The flavor {} failed to parse your input. '
'Since you passed a non-rewindable file '
'object, we can\'t rewind it to try '
'another parser. Try read_html() with a '
'different flavor.'.format(flav))

retained = caught
else:
break
Expand Down
50 changes: 50 additions & 0 deletions pandas/tests/io/test_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -967,3 +967,53 @@ def test_importcheck_thread_safety():
while helper_thread1.is_alive() or helper_thread2.is_alive():
pass
assert None is helper_thread1.err is helper_thread2.err


def test_parse_failure_unseekable():
    # Issue #17975
    # A file object that reports itself as non-seekable cannot be rewound
    # after a parser error, so read_html is expected to raise a ValueError
    # whose message mentions the non-rewindable file object.
    _skip_if_no('lxml')

    class NoRewindStringIO(StringIO):
        # Only ``seekable`` is overridden; ``seek`` is inherited unchanged.
        def seekable(self):
            return False

    parseable = NoRewindStringIO(
        '\n<table><tr><td>spam<br />eggs</td></tr></table>')
    malformed = NoRewindStringIO(
        '\n<table><tr><td>spam<foobr />eggs</td></tr></table>')

    assert read_html(parseable)
    assert read_html(malformed, flavor='bs4')

    # Reposition the buffer at the start before handing it to read_html
    # again without an explicit flavor.
    malformed.seek(0)

    expected_message = 'passed a non-rewindable file object'
    with pytest.raises(ValueError, match=expected_message):
        read_html(malformed)


def test_parse_failure_rewinds():
    # Issue #17975
    # A seekable file-like object should still yield tables from read_html
    # for both the well-formed and the malformed markup below.
    _skip_if_no('lxml')

    class MockFile(object):
        # Minimal file-like stub: the first ``read`` returns the whole
        # payload, later reads return '' until ``seek`` is called.
        def __init__(self, data):
            self.data = data
            self.at_end = False

        def read(self, size=None):
            if self.at_end:
                payload = ''
            else:
                payload = self.data
            self.at_end = True
            return payload

        def seek(self, offset):
            # The offset is ignored; any seek makes the payload
            # readable again.
            self.at_end = False

        def seekable(self):
            return True

    well_formed = MockFile(
        '<table><tr><td>spam<br />eggs</td></tr></table>')
    malformed = MockFile(
        '<table><tr><td>spam<foobr />eggs</td></tr></table>')

    assert read_html(well_formed)
    assert read_html(malformed)