Skip to content

HTML Parsing Cleanup #5395

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion doc/source/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,9 @@ Improvements to existing features
by color as expected.
- ``read_excel()`` now tries to convert integral floats (like ``1.0``) to int
by default. (:issue:`5394`)

- ``read_html`` can accept a subclass of ``Flavor`` rather than a string for
  the parsing flavor. This allows user-written HTML parsers. (:issue:`5395`)

API Changes
~~~~~~~~~~~

Expand Down
72 changes: 49 additions & 23 deletions pandas/io/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,14 @@ def _read(io):
return raw_text


class Flavor(object):
    """Marker base class for user-written HTML parsing flavors.

    Subclass this and pass the subclass (instead of one of the built-in
    flavor strings such as ``'lxml'`` or ``'bs4'``) as the ``flavor``
    argument of ``read_html`` to plug in a custom parser.
    """


class _HtmlFrameParser(object):
"""Base class for parsers that parse HTML into DataFrames.

Expand Down Expand Up @@ -165,13 +173,14 @@ class _HtmlFrameParser(object):
See each method's respective documentation for details on their
functionality.
"""
def __init__(self, io, match, attrs):
self.io = io
def __init__(self, match, attrs):
self.match = match
self.attrs = attrs

def parse_tables(self):
tables = self._parse_tables(self._build_doc(), self.match, self.attrs)
def parse_tables(self, io):
tables = self._parse_tables(self._build_doc(io),
self.match,
self.attrs)
return (self._build_table(table) for table in tables)

def _parse_raw_data(self, rows):
Expand Down Expand Up @@ -314,7 +323,7 @@ def _parse_tfoot(self, table):
"""
raise NotImplementedError

def _build_doc(self):
def _build_doc(self, io):
"""Return a tree-like object that can be used to iterate over the DOM.

Returns
Expand Down Expand Up @@ -414,15 +423,15 @@ def _parse_tables(self, doc, match, attrs):
match.pattern)
return result

def _setup_build_doc(self):
raw_text = _read(self.io)
def _setup_build_doc(self, io):
raw_text = _read(io)
if not raw_text:
raise ValueError('No text parsed from document: %s' % self.io)
raise ValueError('No text parsed from document: %s' % io)
return raw_text

def _build_doc(self):
def _build_doc(self, io):
from bs4 import BeautifulSoup
return BeautifulSoup(self._setup_build_doc(), features='html5lib')
return BeautifulSoup(self._setup_build_doc(io), features='html5lib')


def _build_xpath_expr(attrs):
Expand Down Expand Up @@ -500,7 +509,11 @@ def _parse_tables(self, doc, match, kwargs):
raise ValueError("No tables found matching regex %r" % pattern)
return tables

def _build_doc(self):
def _get_parser(self):
    """Return the lxml parser instance used by ``_build_doc``."""
    from lxml.html import HTMLParser

    # recover=False: raise on malformed markup instead of silently
    # repairing it, so parse errors propagate to the caller.
    strict_parser = HTMLParser(recover=False)
    return strict_parser

def _build_doc(self, io):
"""
Raises
------
Expand All @@ -516,31 +529,31 @@ def _build_doc(self):
--------
pandas.io.html._HtmlFrameParser._build_doc
"""
from lxml.html import parse, fromstring, HTMLParser
from lxml.html import parse, fromstring
from lxml.etree import XMLSyntaxError

parser = HTMLParser(recover=False)
parser = self._get_parser()

try:
# try to parse the input in the simplest way
r = parse(self.io, parser=parser)
r = parse(io, parser=parser)

try:
r = r.getroot()
except AttributeError:
pass
except (UnicodeDecodeError, IOError):
# if the input is a blob of html goop
if not _is_url(self.io):
r = fromstring(self.io, parser=parser)
if not _is_url(io):
r = fromstring(io, parser=parser)

try:
r = r.getroot()
except AttributeError:
pass
else:
# not a url
scheme = parse_url(self.io).scheme
scheme = parse_url(io).scheme
if scheme not in _valid_schemes:
# lxml can't parse it
msg = ('%r is not a valid url scheme, valid schemes are '
Expand Down Expand Up @@ -611,7 +624,16 @@ def _data_to_frame(data, header, index_col, skiprows, infer_types,

_valid_parsers = {'lxml': _LxmlFrameParser, None: _LxmlFrameParser,
'html5lib': _BeautifulSoupHtml5LibFrameParser,
'bs4': _BeautifulSoupHtml5LibFrameParser}
'bs4': _BeautifulSoupHtml5LibFrameParser,
}


def _is_flavor(flav):
    """Return True if `flav` is a class deriving from ``Flavor``.

    Parameters
    ----------
    flav : object
        Candidate flavor; may be a string (e.g. ``'lxml'``), a
        ``Flavor`` subclass, or anything else a user passed in.

    Returns
    -------
    bool
        True only for ``Flavor`` subclasses; False (never TypeError)
        for non-class inputs such as flavor strings.

    Notes
    -----
    ``np.issubclass_`` was used here only because, unlike the builtin
    ``issubclass``, it returns False instead of raising TypeError for
    non-class arguments.  It was deprecated in NumPy 1.20 and later
    removed; guarding the builtin with ``isinstance(flav, type)`` is
    the stdlib-only equivalent (``Flavor`` derives from ``object``,
    so all of its subclasses are new-style classes, i.e. ``type``
    instances).
    """
    return isinstance(flav, type) and issubclass(flav, Flavor)


def _is_string_or_flavor(flav):
    """Return True if `flav` is a flavor string or a ``Flavor`` subclass.

    Used by ``_validate_flavor`` to vet each user-supplied flavor entry.
    """
    if isinstance(flav, string_types):
        return True
    return _is_flavor(flav)


def _parser_dispatch(flavor):
Expand All @@ -634,6 +656,9 @@ def _parser_dispatch(flavor):
ImportError
* If you do not have the requested `flavor`
"""
if _is_flavor(flavor):
return flavor

valid_parsers = list(_valid_parsers.keys())
if flavor not in valid_parsers:
raise ValueError('%r is not a valid flavor, valid flavors are %s' %
Expand Down Expand Up @@ -665,10 +690,10 @@ def _print_as_set(s):
def _validate_flavor(flavor):
if flavor is None:
flavor = 'lxml', 'bs4'
elif isinstance(flavor, string_types):
elif _is_string_or_flavor(flavor):
flavor = flavor,
elif isinstance(flavor, collections.Iterable):
if not all(isinstance(flav, string_types) for flav in flavor):
if not all(_is_string_or_flavor(flav) for flav in flavor):
raise TypeError('Object of type %r is not an iterable of strings' %
type(flavor).__name__)
else:
Expand All @@ -679,8 +704,9 @@ def _validate_flavor(flavor):
flavor = tuple(flavor)
valid_flavors = set(_valid_parsers)
flavor_set = set(flavor)
flavor_set_flavors = [f for f in flavor_set if _is_flavor(f)]

if not flavor_set & valid_flavors:
if not flavor_set & valid_flavors and not flavor_set_flavors:
raise ValueError('%s is not a valid set of flavors, valid flavors are '
'%s' % (_print_as_set(flavor_set),
_print_as_set(valid_flavors)))
Expand All @@ -696,10 +722,10 @@ def _parse(flavor, io, match, header, index_col, skiprows, infer_types,
retained = None
for flav in flavor:
parser = _parser_dispatch(flav)
p = parser(io, compiled_match, attrs)
p = parser(compiled_match, attrs)

try:
tables = p.parse_tables()
tables = p.parse_tables(io)
except Exception as caught:
retained = caught
else:
Expand Down
31 changes: 29 additions & 2 deletions pandas/io/tests/test_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
date_range, Series)
from pandas.compat import map, zip, StringIO, string_types
from pandas.io.common import URLError, urlopen, file_path_to_url
from pandas.io.html import read_html
from pandas.io.html import read_html, Flavor

import pandas.util.testing as tm
from pandas.util.testing import makeCustomDataframe as mkdf, network
Expand Down Expand Up @@ -602,7 +602,34 @@ def test_data_fail(self):

with tm.assertRaises(XMLSyntaxError):
self.read_html(banklist_data, flavor=['lxml'])


def test_custom_html_parser1(self):
    # Verify that read_html accepts a user-written parser (a Flavor
    # subclass) both wrapped in a list and passed as a bare class.
    import re
    t_match = re.compile(".")
    t_attrs = object()
    # Capture the test case so the nested class's __init__ can assert
    # that read_html forwards match/attrs to the parser verbatim.
    that = self

    class _CustomHtmlFrameParser(Flavor):
        def __init__(self, match, attrs):
            # read_html must pass through the exact objects it was given.
            that.assertTrue(t_match is match)
            that.assertTrue(t_attrs is attrs)

        def parse_tables(self, io):
            # A single fabricated table: empty header, two body rows
            # (["a", "b"] then [1, 2]), empty footer.
            return [[[],[["a", "b"],[1,2]],[]]]

    banklist_data = os.path.join(DATA_PATH, 'banklist.html')

    # Flavor subclass supplied inside a list of flavors.
    dfs = self.read_html(banklist_data, flavor=[_CustomHtmlFrameParser], match=t_match, attrs=t_attrs)
    for df in dfs:
        tm.assert_isinstance(df, DataFrame)
        self.assertFalse(df.empty)
        self.assertEqual(df[0][0], "a")

    # Flavor subclass supplied as a bare class (no list wrapper).
    dfs = self.read_html(banklist_data, flavor=_CustomHtmlFrameParser, match=t_match, attrs=t_attrs)
    for df in dfs:
        tm.assert_isinstance(df, DataFrame)
        self.assertFalse(df.empty)
        self.assertEqual(df[1][1], 2)

def test_works_on_valid_markup(self):
filename = os.path.join(DATA_PATH, 'valid_markup.html')
dfs = self.read_html(filename, index_col=0, flavor=['lxml'])
Expand Down