diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 495d0beaf3faa..d46fa76b9112e 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -63,6 +63,56 @@ levels ` documentation section. left.merge(right, on=['key1', 'key2']) +.. _whatsnew_0220.enhancements.read_csv: + +``read_csv`` uses `python-requests` (if installed) to support basic auth and more +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +When a non-empty ``http_params`` is passed and the `python-requests` library is installed, it is used for the download; without ``http_params``, ``urllib`` is used as before. Passing ``http_params`` without `python-requests` installed raises ``ValueError``. +:func:`read_csv`, :func:`read_html`, :func:`read_json` and +:func:`read_excel` now accept an optional ``http_params`` argument to pass in +parameters for basic auth, to disable strict SSL certificate checks, or even a ``requests.Session()`` object. + + +.. ipython:: python + import pandas as pd + + # http_params is an optional parameter. If it is non-empty, pandas attempts to use the python-requests library + df = pd.read_csv('https://uname:pwd@aa.com/bb.csv', http_params= {'auth': None} ) # now url can contain username and pwd + # Note - all basic auth scenarios require python-requests library + + # Basic Auth + df = pd.read_csv('https://aa.com/bb.csv', http_params={ 'auth': ('john', 'pwd') } ) # credentials are passed via http_params + + # Basic auth with SSL certificate verification disabled, e.g. for testing + up = { 'auth': ('john', 'pwd') , 'verify' : False} + df = pd.read_csv('https://aa.com/bb.csv', http_params=up ) # credentials are passed via http_params + + # Optionally, a requests.Session() can also be passed into http_params + import requests + s = requests.Session() + s.auth = MyAuthProvider('secret-key') # custom auth provider supported by requests + df = pd.read_csv(url, http_params=s) + + # For advanced users, this may provide extensibility. 
try:
    import requests
    _REQUESTS_INSTALLED = True
except ImportError:
    _REQUESTS_INSTALLED = False


def _is_handled_by_requests(o):
    """Return True if ``o`` is a URL whose scheme is ``http`` or ``https``."""
    return _is_url(o) and parse_url(o).scheme in ('http', 'https')


def gen_session(http_params):
    """
    Build (or reuse) a python-requests session from ``http_params``.

    Parameters
    ----------
    http_params : dict, requests.Session or None
        * ``requests.Session`` (or subclass): reused as-is, apart from the
          compatibility settings described below.
        * dict: a new session is created; a truthy ``'auth'`` value is
          copied to ``session.auth`` and ``'verify': False`` disables SSL
          certificate verification.  Other keys are ignored.
        * anything else (e.g. ``None``): a new default session.

    Returns
    -------
    requests.Session

    Notes
    -----
    Every session returned from here, including one supplied by the
    caller, is switched to ``stream = True`` and has its
    ``Accept-Encoding`` header cleared, so a caller-supplied session is
    modified in place.
    """
    # isinstance (rather than an exact type check) so that Session / dict
    # subclasses are honoured as well.
    if isinstance(http_params, requests.Session):
        session = http_params
    else:
        session = requests.Session()

    session.stream = True
    # Setting Accept-Encoding to None for backwards compatibility with
    # urlopen: urlopen does not decompress automatically, requests does.
    # Ideally gzip download would be allowed.
    session.headers.update({'Accept-Encoding': None})

    if isinstance(http_params, dict):
        auth = http_params.get('auth')
        if auth:
            session.auth = auth
        # Only an explicit ``False`` switches verification off.
        if http_params.get('verify', True) is False:
            session.verify = False
    return session


def fetch_url(url, http_params=None, skip_requests=False):
    """
    Download ``url`` and return the response object together with its body.

    python-requests is used only when ``http_params`` is non-empty, the
    library is installed, ``skip_requests`` is False and the URL scheme is
    http(s); in every other non-error case ``urlopen`` is used.
    Note if requests library is used, auto gunzip is disabled for
    backwards compatibility of code with urlopen.

    Parameters
    ----------
    url : str
        Could be:
            'http://cnn.com'
            'file:///home/sky/aaa.csv'

    http_params : dict or requests.Session(), default None
        A python dict containing:
            'auth': tuple (str, str) eg (username, password)
            'auth': Any other auth object accepted by requests
            'verify': boolean, default True
                If False, allow self signed and invalid SSL cert for https
        or
        A python requests.Session object if http(s) path to enable basic
        auth and many other scenarios that requests allows

        .. versionadded:: 0.22.0

    skip_requests : boolean, default False
        for testing - disable `requests` library. Internal use only

        .. versionadded:: 0.22.0

    Returns
    -------
    resp, content_bytes
        The response object (already closed, its ``headers`` remain
        readable) and the complete body as bytes.

    Raises
    ------
    ValueError if http_params specified without installed python-requests pkg
    requests.exceptions.HTTPError if requests is used and the server
    answers with an error status
    """
    if http_params:
        if skip_requests or not _REQUESTS_INSTALLED:
            msg = ('To utilize http_params, python-requests library is '
                   'required but not detected')
            raise ValueError(msg)

        if _is_handled_by_requests(url):
            # Never close a session the caller handed in; they may reuse it.
            caller_owns_session = isinstance(http_params, requests.Session)
            session = gen_session(http_params)
            try:
                resp = session.get(url)
                try:
                    resp.raise_for_status()
                    content_bytes = resp.content
                finally:
                    # stream=True keeps the connection checked out until
                    # the response is closed, also on an HTTP error.
                    resp.close()
            finally:
                if not caller_owns_session:
                    session.close()
            return resp, content_bytes
        # Non-http(s) URLs (e.g. file://) fall through to urlopen and
        # http_params is not applied to them.

    resp = _urlopen(url)
    try:
        content_bytes = resp.read()
    finally:
        # try/finally instead of ``with``: the py2 urlopen result is not a
        # context manager.
        resp.close()
    return resp, content_bytes
versionadded:: 0.22.0 + Returns ------- a filepath_or_buffer, the encoding, the compression + + Raises + ------ + ValueError if http_params specified without installed python-requests pkg """ filepath_or_buffer = _stringify_path(filepath_or_buffer) if _is_url(filepath_or_buffer): - req = _urlopen(filepath_or_buffer) + req, content_bytes = fetch_url(filepath_or_buffer, + http_params, + skip_requests) + reader = BytesIO(content_bytes) content_encoding = req.headers.get('Content-Encoding', None) if content_encoding == 'gzip': # Override compression based on Content-Encoding header compression = 'gzip' - reader = BytesIO(req.read()) return reader, encoding, compression if _is_s3_url(filepath_or_buffer): diff --git a/pandas/io/excel.py b/pandas/io/excel.py index 882130bedcbf0..78a8229cd7933 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -15,10 +15,11 @@ is_integer, is_float, is_bool, is_list_like) +from pandas.compat import BytesIO from pandas.core.frame import DataFrame from pandas.io.parsers import TextParser from pandas.errors import EmptyDataError -from pandas.io.common import (_is_url, _urlopen, _validate_header_arg, +from pandas.io.common import (_is_url, fetch_url, _validate_header_arg, get_filepath_or_buffer, _NA_VALUES, _stringify_path) from pandas.core.indexes.period import Period @@ -148,6 +149,19 @@ data will be read in as floats: Excel stores all numbers as floats internally +http_params : dict or requests.Session(), default None + A python dict containing: + 'auth': tuple (str, str) eg (unae, pwd) + 'auth': Any other auth object accepted by requests + 'verify': boolean, Default True + If False, allow self signed and invalid SSL certs for https + or + A python requests.Session object if http(s) path to enable basic auth + and many other scenarios that requests allows + + .. 
versionadded:: 0.22.0 + + Returns ------- parsed : DataFrame or Dict of DataFrames @@ -199,7 +213,6 @@ def read_excel(io, sheet_name=0, header=0, skiprows=None, skip_footer=0, convert_float=True, converters=None, dtype=None, true_values=None, false_values=None, engine=None, squeeze=False, **kwds): - # Can't use _deprecate_kwarg since sheetname=None has a special meaning if is_integer(sheet_name) and sheet_name == 0 and 'sheetname' in kwds: warnings.warn("The `sheetname` keyword is deprecated, use " @@ -210,7 +223,10 @@ def read_excel(io, sheet_name=0, header=0, skiprows=None, skip_footer=0, "Use just `sheet_name`") if not isinstance(io, ExcelFile): - io = ExcelFile(io, engine=engine) + ukwds = {} + if kwds.get('http_params', None) is not None: + ukwds['http_params'] = kwds.get('http_params') + io = ExcelFile(io, engine=engine, **ukwds) return io._parse_excel( sheetname=sheet_name, header=header, skiprows=skiprows, names=names, @@ -263,7 +279,9 @@ def __init__(self, io, **kwds): # If io is a url, want to keep the data as bytes so can't pass # to get_filepath_or_buffer() if _is_url(self._io): - io = _urlopen(self._io) + hp = kwds.get('http_params', None) + req, content = fetch_url(self._io, http_params=hp) + io = BytesIO(content) elif not isinstance(self.io, (ExcelFile, xlrd.Book)): io, _, _ = get_filepath_or_buffer(self._io) diff --git a/pandas/io/html.py b/pandas/io/html.py index d0861f1aa4ec6..9ae6300ef5364 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -14,7 +14,7 @@ from pandas.core.dtypes.common import is_list_like from pandas.errors import EmptyDataError -from pandas.io.common import (_is_url, urlopen, +from pandas.io.common import (_is_url, fetch_url, parse_url, _validate_header_arg) from pandas.io.parsers import TextParser from pandas.compat import (lrange, lmap, u, string_types, iteritems, @@ -116,20 +116,31 @@ def _get_skiprows(skiprows): type(skiprows).__name__) -def _read(obj): +def _read(obj, http_params=None): """Try to read from a url, file 
or string. Parameters ---------- obj : str, unicode, or file-like + http_params : dict or requests.Session(), default None + A python dict containing: + 'auth': tuple (str, str) eg (unae, pwd) + 'auth': Any other auth object accepted by requests + 'verify': boolean, default True + If False, allow self signed and invalid SSL certs for https + or + A python requests.Session object if http(s) path to enable basic auth + and many other scenarios that requests allows + + .. versionadded:: 0.22.0 + Returns ------- raw_text : str """ if _is_url(obj): - with urlopen(obj) as url: - text = url.read() + req, text = fetch_url(obj, http_params) elif hasattr(obj, 'read'): text = obj.read() elif isinstance(obj, char_types): @@ -172,6 +183,24 @@ class _HtmlFrameParser(object): A dictionary of valid table attributes to use to search for table elements. + encoding : str or None, optional + The encoding used to decode the web page. Defaults to ``None``.``None`` + preserves the previous encoding behavior, which depends on the + underlying parser library (e.g., the parser library will try to use + the encoding provided by the document). + + http_params : dict or requests.Session(), default None + A python dict containing: + 'auth': tuple (str, str) eg (username, password) + 'auth': Any other auth object accepted by requests + 'verify': boolean, default True + If False, allow self signed and invalid SSL cert for https + or + A python requests.Session object if http(s) path to enable basic auth + and many other scenarios that requests allows + + .. versionadded:: 0.22.0 + Notes ----- To subclass this class effectively you must override the following methods: @@ -187,11 +216,12 @@ class _HtmlFrameParser(object): functionality. 
""" - def __init__(self, io, match, attrs, encoding): + def __init__(self, io, match, attrs, encoding, http_params=None): self.io = io self.match = match self.attrs = attrs self.encoding = encoding + self.http_params = http_params def parse_tables(self): tables = self._parse_tables(self._build_doc(), self.match, self.attrs) @@ -444,7 +474,7 @@ def _parse_tables(self, doc, match, attrs): return result def _setup_build_doc(self): - raw_text = _read(self.io) + raw_text = _read(self.io, self.http_params) if not raw_text: raise ValueError('No text parsed from document: {doc}' .format(doc=self.io)) @@ -737,7 +767,8 @@ def _parse(flavor, io, match, attrs, encoding, **kwargs): retained = None for flav in flavor: parser = _parser_dispatch(flav) - p = parser(io, compiled_match, attrs, encoding) + p = parser(io, compiled_match, attrs, encoding, + http_params=kwargs.get('http_params', None)) try: tables = p.parse_tables() @@ -773,7 +804,7 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, skiprows=None, attrs=None, parse_dates=False, tupleize_cols=None, thousands=',', encoding=None, decimal='.', converters=None, na_values=None, - keep_default_na=True): + keep_default_na=True, http_params=None): r"""Read HTML tables into a ``list`` of ``DataFrame`` objects. Parameters @@ -877,6 +908,12 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, .. versionadded:: 0.19.0 + http_params : requests.Session(), default None + A python requests.Session object if http(s) path to enable basic auth + and many other scenarios that requests allows + + .. 
versionadded:: 0.22.0 + Returns ------- dfs : list of DataFrames @@ -924,4 +961,4 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, parse_dates=parse_dates, tupleize_cols=tupleize_cols, thousands=thousands, attrs=attrs, encoding=encoding, decimal=decimal, converters=converters, na_values=na_values, - keep_default_na=keep_default_na) + keep_default_na=keep_default_na, http_params=http_params) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 21736673350d8..2be0745d71446 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -182,7 +182,8 @@ def write(self): def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, convert_axes=True, convert_dates=True, keep_default_dates=True, numpy=False, precise_float=False, date_unit=None, encoding=None, - lines=False, chunksize=None, compression='infer'): + lines=False, chunksize=None, compression='infer', + http_params=None): """ Convert a JSON string to pandas object @@ -290,6 +291,18 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, .. versionadded:: 0.21.0 + http_params : dict or requests.Session(), default None + A python dict containing: + 'auth': tuple (str, str) eg (unae, pwd) + 'auth': Any other auth object accepted by requests + 'verify': boolean, Default True + If False, allow self signed and invalid SSL certs for https + or + A python requests.Session object if http(s) path to enable basic auth + and many other scenarios that requests allows + + .. versionadded:: 0.22.0 + Returns ------- result : Series or DataFrame, depending on the value of `typ`. 
@@ -350,6 +363,7 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, compression = _infer_compression(path_or_buf, compression) filepath_or_buffer, _, compression = get_filepath_or_buffer( path_or_buf, encoding=encoding, compression=compression, + http_params=http_params ) json_reader = JsonReader( diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 83b1d8ec1a070..caee47e058286 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -427,10 +427,11 @@ def _read(filepath_or_buffer, kwds): encoding = re.sub('_', '-', encoding).lower() kwds['encoding'] = encoding + http_params = kwds.get('http_params', None) compression = kwds.get('compression') compression = _infer_compression(filepath_or_buffer, compression) filepath_or_buffer, _, compression = get_filepath_or_buffer( - filepath_or_buffer, encoding, compression) + filepath_or_buffer, encoding, compression, http_params) kwds['compression'] = compression if kwds.get('date_parser', None) is not None: @@ -624,7 +625,11 @@ def parser_f(filepath_or_buffer, low_memory=_c_parser_defaults['low_memory'], buffer_lines=None, memory_map=False, - float_precision=None): + float_precision=None, + + # python requests session + http_params=None, + ): # Alias sep -> delimiter. 
"""
Tests for the ``http_params`` argument of ``read_csv`` / ``read_json`` /
``read_excel`` / ``read_html`` and for the helpers in ``pandas.io.common``.

The auth tests talk to two externally hosted sites and are marked slow.
"""
import pytest
import pandas as pd
import pandas.util.testing as tm

if pd.io.common._REQUESTS_INSTALLED:
    from requests.packages.urllib3.exceptions import InsecureRequestWarning
    import requests
    requests_pkg = True
else:
    requests_pkg = False


def _skip_if_no_requests():
    if not requests_pkg:
        pytest.skip('python-requests not installed, skipping')


uname = 'pandasusr'
pwd = 'pandaspwd'
no_auth_path = 'no_auth/'
basic_auth_path = 'basic_auth/'
valid_ssl_url = 'handsome-equator.000webhostapp.com'
invalid_ssl_url = 'pandas-unittest.site11.com'


def gen_http_auth_ssl_test_cases(uname, pwd, is_auth, sub_path):
    """
    Generate list of test case to test for : http/https, username/pwd in url
    or as parameters, self signed ssl certs or trusted ssl certs, no auth
    or basic auth

    Returns a list of ``[url, uname, pwd, verify_ssl]``.  Without auth there
    is one case per host/scheme.  With auth there are two: credentials
    passed separately, then credentials embedded in the url (and
    ``uname``/``pwd`` set to None).
    """
    # The following host doesn't seem to handle urllib but handles
    # python requests package. This is because:
    # 'urlopen' sets header 'Host' : ':' - acceptable RFC7230
    # 'requests' sets header 'Host' : ''
    # so pandas fails on following hosting server (uses some 'apex' server)
    # but pandas succeeds on nginx even if port is non-default.
    hosts = [(invalid_ssl_url, False), (valid_ssl_url, True)]
    cases = []
    for host, verify_ssl in hosts:
        for scheme in ('http', 'https'):
            plain_url = '{}://{}/{}'.format(scheme, host, sub_path)
            if not is_auth:
                cases.append([plain_url, None, None, verify_ssl])
                continue
            cases.append([plain_url, uname, pwd, verify_ssl])
            cred_url = '{}://{}:{}@{}/{}'.format(
                scheme, uname, pwd, host, sub_path)
            cases.append([cred_url, None, None, verify_ssl])
    return cases


valid_no_auth = gen_http_auth_ssl_test_cases(uname='',
                                             pwd='',
                                             is_auth=False,
                                             sub_path=no_auth_path)

valid_auth = gen_http_auth_ssl_test_cases(uname=uname,
                                          pwd=pwd,
                                          is_auth=True,
                                          sub_path=basic_auth_path)


@pytest.mark.slow
@pytest.mark.parametrize('url, uname, pwd, verify_ssl',
                         valid_auth)
def test_http_valid_auth(url, uname, pwd, verify_ssl):
    _skip_if_no_requests()
    check_http_auth(url, uname, pwd, verify_ssl)


@pytest.mark.slow
@pytest.mark.parametrize('url, uname, pwd, verify_ssl',
                         valid_no_auth)
def test_http_valid_no_auth(url, uname, pwd, verify_ssl):
    if requests_pkg:
        check_http_auth(url, uname, pwd, verify_ssl)
        return

    # Without requests: the insecure-SSL case cannot run at all ...
    if verify_ssl is False:
        _skip_if_no_requests()
    # ... and any case that ends up passing a non-empty http_params must
    # be rejected with ValueError.
    passes_http_params = verify_ssl is not None or uname or pwd
    if passes_http_params:
        with pytest.raises(ValueError):
            check_http_auth(url, uname, pwd, verify_ssl)
    else:
        check_http_auth(url, uname, pwd, verify_ssl)


wrong_auth = gen_http_auth_ssl_test_cases(uname='fakepwd',
                                          pwd='fakepwd',
                                          is_auth=True,
                                          sub_path=basic_auth_path)


@pytest.mark.slow
@pytest.mark.parametrize('url, uname, pwd, verify_ssl',
                         wrong_auth)
def test_http_invalid_auth(url, uname, pwd, verify_ssl):
    _skip_if_no_requests()
    with pytest.raises(requests.exceptions.HTTPError):
        check_http_auth(url, uname, pwd, verify_ssl)


# NOTE(review): the two case lists below are generated but no test in this
# module consumes them yet.
blank_uname = gen_http_auth_ssl_test_cases(uname='',
                                           pwd='fakepwd',
                                           is_auth=True,
                                           sub_path=basic_auth_path)

blank_pwd = gen_http_auth_ssl_test_cases(uname='fakepwd',
                                         pwd='',
                                         is_auth=True,
                                         sub_path=basic_auth_path)


def match_csv(df):
    tcsv = 'animal,bird\ndog,pigeon\ncat,emu\n'
    return str(df.to_csv(index=False)) == tcsv


def match_json(df):
    j = '{"animal":{"0":"dog","1":"cat"},"bird":{"0":"pigeon","1":"emu"}}'
    return str(df.to_json()) == j


@tm.network
def check_http_auth(url, uname, pwd, verify_ssl):
    """
    Read the same small table through every ``read_*`` entry point and
    assert that each one yields the expected frame.

    ``url`` is the directory url; the per-format file name is appended.
    """
    readers = [(pd.read_csv, 'aaa.csv'),
               (pd.read_json, 'jdoc.json'),
               (pd.read_excel, 'ex_doc.xlsx'),
               (pd.read_html, 'html_file.html')]

    http_params = {}
    if uname or pwd:
        http_params['auth'] = (uname, pwd)
    if verify_ssl is not None:
        http_params['verify'] = verify_ssl

    for pd_read_fn, fname in readers:
        furl = url + fname
        if verify_ssl or furl.lower().startswith('http://'):
            df = pd_read_fn(furl, http_params=http_params)
        else:
            # https with verification switched off must warn
            with tm.assert_produces_warning(InsecureRequestWarning):
                df = pd_read_fn(furl, http_params=http_params)
        if isinstance(df, list):  # read_html returns a list of frames
            df = df[0]
        assert match_json(df), '{}: unexpected json content'.format(furl)
        assert match_csv(df), '{}: unexpected csv content'.format(furl)


@pytest.mark.parametrize('up',
                         [
                             None,
                             {},
                             {'auth': ('uname', 'pwd')},
                             {'verify': False},
                             {'auth': ('uname', 'pwd'), 'verify': False},
                         ]
                         )
def test_http_params(up):
    _skip_if_no_requests()
    s = pd.io.common.gen_session(http_params=up)
    assert isinstance(s, requests.sessions.Session)
    if up and up.get('auth', None):
        assert s.auth == up['auth']
    if up and (up.get('verify', True) is False):
        assert s.verify is False


def test_pass_session_obj():
    _skip_if_no_requests()
    s = requests.sessions.Session()
    s.auth = ('uname', 'pwd')
    s.verify = False
    t = pd.io.common.gen_session(http_params=s)
    # the very same object must come back, with its settings untouched
    assert t is s
    assert t.auth == ('uname', 'pwd')
    assert t.verify is False


def test_skip_requests():
    # ValueError is raised before any network access happens
    with pytest.raises(ValueError):
        pd.io.common.fetch_url('http://cnn.com',
                               http_params={'auth': (uname, pwd)},
                               skip_requests=True)