diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt index 9f88d629880ed..b21ea88bf177f 100644 --- a/doc/source/whatsnew/v0.20.2.txt +++ b/doc/source/whatsnew/v0.20.2.txt @@ -70,6 +70,7 @@ I/O - Bug in pd.read_csv() when comment is passed in space deliminted text files (:issue:`16472`) - Bug that would force importing of the clipboard routines unnecessarily, potentially causing an import error on startup (:issue:`16288`) - Bug that raised IndexError HTML-rendering an empty DataFrame (:issue:`15953`) +- Bug in ``pd.read_csv()`` in which tarfile object inputs were raising an error in Python 2.x for the C engine (:issue:`16530`) - Bug in ``HDFStore.select_as_multiple()`` where start/stop arguments were not respected (:issue:`16209`) diff --git a/pandas/core/dtypes/inference.py b/pandas/core/dtypes/inference.py index a5316a83612cb..ff7e215951a1f 100644 --- a/pandas/core/dtypes/inference.py +++ b/pandas/core/dtypes/inference.py @@ -171,7 +171,7 @@ def is_file_like(obj): if not (hasattr(obj, 'read') or hasattr(obj, 'write')): return False - if not is_iterator(obj): + if not hasattr(obj, "__iter__"): return False return True diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 12b606d969c7d..aab70c8ce2cd4 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -13,7 +13,7 @@ import numpy as np from pandas import compat -from pandas.compat import (range, lrange, StringIO, lzip, +from pandas.compat import (range, lrange, PY3, StringIO, lzip, zip, string_types, map, u) from pandas.core.dtypes.common import ( is_integer, _ensure_object, @@ -31,10 +31,10 @@ from pandas.core.common import AbstractMethodError from pandas.io.date_converters import generic_parser from pandas.errors import ParserWarning, ParserError, EmptyDataError -from pandas.io.common import (get_filepath_or_buffer, _validate_header_arg, - _get_handle, UnicodeReader, UTF8Recoder, - BaseIterator, - _NA_VALUES, _infer_compression) +from pandas.io.common import (get_filepath_or_buffer, is_file_like, + _validate_header_arg, _get_handle, + UnicodeReader, UTF8Recoder, _NA_VALUES, + BaseIterator, _infer_compression) from pandas.core.tools import datetimes as tools from pandas.util._decorators import Appender @@ -755,7 +755,9 @@ def __init__(self, f, engine=None, **kwds): self.squeeze = options.pop('squeeze', False) # might mutate self.engine + self.engine = self._check_file_or_buffer(f, engine) self.options, self.engine = self._clean_options(options, engine) + if 'has_index_names' in kwds: self.options['has_index_names'] = kwds['has_index_names'] @@ -801,6 +803,23 @@ def _get_options_with_defaults(self, engine): return options + def _check_file_or_buffer(self, f, engine): + # see gh-16530 + if is_file_like(f): + next_attr = "__next__" if PY3 else "next" + + # The C engine doesn't need the file-like to have the "next" or + # "__next__" attribute. However, the Python engine explicitly calls + # "next(...)" when iterating through such an object, meaning it + # needs to have that attribute ("next" for Python 2.x, "__next__" + # for Python 3.x) + if engine != "c" and not hasattr(f, next_attr): + msg = ("The 'python' engine cannot iterate " + "through this file buffer.") + raise ValueError(msg) + + return engine + def _clean_options(self, options, engine): result = options.copy() diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index b88481abcb2ec..ec5fe45d7f610 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -120,9 +120,9 @@ class MockFile(object): m = MockFile() assert not is_file(m) + # gh-16530: Valid iterator just means we have the + # __iter__ attribute for our purposes. MockFile.__iter__ = lambda self: self - MockFile.__next__ = lambda self: 0 - MockFile.next = MockFile.__next__ # Valid write-only file m = MockFile() diff --git a/pandas/tests/io/parser/c_parser_only.py b/pandas/tests/io/parser/c_parser_only.py index 56ac10404b7b2..48812c04e3b55 100644 --- a/pandas/tests/io/parser/c_parser_only.py +++ b/pandas/tests/io/parser/c_parser_only.py @@ -7,7 +7,9 @@ further arguments when parsing. """ +import os import sys +import tarfile import pytest import numpy as np @@ -446,3 +448,37 @@ def test_comment_whitespace_delimited(self): [7, np.nan], [8, np.nan]]) tm.assert_frame_equal(df, expected) + + def test_file_like_no_next(self): + # gh-16530: the file-like need not have a "next" or "__next__" + # attribute despite having an "__iter__" attribute. + # + # NOTE: This is only true for the C engine, not Python engine. + class NoNextBuffer(StringIO): + def __next__(self): + raise AttributeError("No next method") + + next = __next__ + + data = "a\n1" + + expected = pd.DataFrame({"a": [1]}) + result = self.read_csv(NoNextBuffer(data)) + + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("tar_suffix", [".tar", ".tar.gz"]) + def test_read_tarfile(self, tar_suffix): + # see gh-16530 + # + # Unfortunately, Python's CSV library can't handle + # tarfile objects (expects string, not bytes when + # iterating through a file-like). + tar_path = os.path.join(self.dirpath, "tar_csv" + tar_suffix) + + tar = tarfile.open(tar_path, "r") + data_file = tar.extractfile("tar_data.csv") + + out = self.read_csv(data_file) + expected = pd.DataFrame({"a": [1]}) + tm.assert_frame_equal(out, expected) diff --git a/pandas/tests/io/parser/data/tar_csv.tar b/pandas/tests/io/parser/data/tar_csv.tar new file mode 100644 index 0000000000000..d1819550e0a00 Binary files /dev/null and b/pandas/tests/io/parser/data/tar_csv.tar differ diff --git a/pandas/tests/io/parser/data/tar_csv.tar.gz b/pandas/tests/io/parser/data/tar_csv.tar.gz new file mode 100644 index 0000000000000..b5a0f3e1b5805 Binary files /dev/null and b/pandas/tests/io/parser/data/tar_csv.tar.gz differ diff --git a/pandas/tests/io/parser/test_unsupported.py b/pandas/tests/io/parser/test_unsupported.py index 3f62ff44531fb..5d248f2fef59c 100644 --- a/pandas/tests/io/parser/test_unsupported.py +++ b/pandas/tests/io/parser/test_unsupported.py @@ -16,6 +16,13 @@ from pandas.errors import ParserError from pandas.io.parsers import read_csv, read_table +import pytest + + +@pytest.fixture(params=["python", "python-fwf"], ids=lambda val: val) +def python_engine(request): + return request.param + class TestUnsupportedFeatures(object): @@ -82,7 +89,7 @@ def test_c_engine(self): with tm.assert_raises_regex(ValueError, msg): read_csv(StringIO(data), lineterminator='~~') - def test_python_engine(self): + def test_python_engine(self, python_engine): from pandas.io.parsers import _python_unsupported as py_unsupported data = """1,2,3,, @@ -90,16 +97,32 @@ def test_python_engine(self): 1,2,3,4,5 1,2,,, 1,2,3,4,""" - engines = 'python', 'python-fwf' - for engine in engines: - for default in py_unsupported: - msg = ('The %r option is not supported ' - 'with the %r engine' % (default, engine)) + for default in py_unsupported: + msg = ('The %r option is not supported ' + 'with the %r engine' % (default, python_engine)) + + kwargs = {default: object()} + with tm.assert_raises_regex(ValueError, msg): + read_csv(StringIO(data), engine=python_engine, **kwargs) - kwargs = {default: object()} - with tm.assert_raises_regex(ValueError, msg): - read_csv(StringIO(data), engine=engine, **kwargs) + def test_python_engine_file_no_next(self, python_engine): + # see gh-16530 + class NoNextBuffer(object): + def __init__(self, csv_data): + self.data = csv_data + + def __iter__(self): + return self + + def read(self): + return self.data + + data = "a\n1" + msg = "The 'python' engine cannot iterate" + + with tm.assert_raises_regex(ValueError, msg): + read_csv(NoNextBuffer(data), engine=python_engine) class TestDeprecatedFeatures(object): diff --git a/setup.py b/setup.py index 82d5f407228a9..31a3cddc3f9fd 100755 --- a/setup.py +++ b/setup.py @@ -702,6 +702,8 @@ def pxd(name): 'parser/data/*.gz', 'parser/data/*.bz2', 'parser/data/*.txt', + 'parser/data/*.tar', + 'parser/data/*.tar.gz', 'sas/data/*.csv', 'sas/data/*.xpt', 'sas/data/*.sas7bdat',