Skip to content

COMPAT: Consider Python 2.x tarfiles file-like #16533

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jun 1, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.20.2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ I/O
- Bug in pd.read_csv() when comment is passed in space delimited text files (:issue:`16472`)
- Bug that would force importing of the clipboard routines unnecessarily, potentially causing an import error on startup (:issue:`16288`)
- Bug that raised IndexError HTML-rendering an empty DataFrame (:issue:`15953`)
- Bug in ``pd.read_csv()`` in which tarfile object inputs were raising an error in Python 2.x for the C engine (:issue:`16530`)

- Bug in ``HDFStore.select_as_multiple()`` where start/stop arguments were not respected (:issue:`16209`)

Expand Down
2 changes: 1 addition & 1 deletion pandas/core/dtypes/inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ def is_file_like(obj):
if not (hasattr(obj, 'read') or hasattr(obj, 'write')):
return False

if not is_iterator(obj):
if not hasattr(obj, "__iter__"):
return False

return True
Expand Down
29 changes: 24 additions & 5 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
import numpy as np

from pandas import compat
from pandas.compat import (range, lrange, StringIO, lzip,
from pandas.compat import (range, lrange, PY3, StringIO, lzip,
zip, string_types, map, u)
from pandas.core.dtypes.common import (
is_integer, _ensure_object,
Expand All @@ -31,10 +31,10 @@
from pandas.core.common import AbstractMethodError
from pandas.io.date_converters import generic_parser
from pandas.errors import ParserWarning, ParserError, EmptyDataError
from pandas.io.common import (get_filepath_or_buffer, _validate_header_arg,
_get_handle, UnicodeReader, UTF8Recoder,
BaseIterator,
_NA_VALUES, _infer_compression)
from pandas.io.common import (get_filepath_or_buffer, is_file_like,
_validate_header_arg, _get_handle,
UnicodeReader, UTF8Recoder, _NA_VALUES,
BaseIterator, _infer_compression)
from pandas.core.tools import datetimes as tools

from pandas.util._decorators import Appender
Expand Down Expand Up @@ -755,7 +755,9 @@ def __init__(self, f, engine=None, **kwds):
self.squeeze = options.pop('squeeze', False)

# might mutate self.engine
self.engine = self._check_file_or_buffer(f, engine)
self.options, self.engine = self._clean_options(options, engine)

if 'has_index_names' in kwds:
self.options['has_index_names'] = kwds['has_index_names']

Expand Down Expand Up @@ -801,6 +803,23 @@ def _get_options_with_defaults(self, engine):

return options

def _check_file_or_buffer(self, f, engine):
    """
    Verify that a file-like input can be iterated by the chosen engine.

    Parameters
    ----------
    f : object
        The input handed to the reader; only inspected when
        ``is_file_like(f)`` is true.
    engine : str
        The requested parser engine.

    Returns
    -------
    engine : str
        The engine that was passed in, unchanged.

    Raises
    ------
    ValueError
        If `f` is file-like, the engine is not "c", and `f` lacks the
        iterator-advancing method for the running Python version.
    """
    # see gh-16530
    if not is_file_like(f) or engine == "c":
        # Nothing to verify: either the input is not a file-like object,
        # or the C engine is in use, which doesn't need the file-like to
        # have the "next" / "__next__" attribute.
        return engine

    # The Python engine explicitly calls "next(...)" when iterating
    # through such an object, so the attribute must exist ("next" for
    # Python 2.x, "__next__" for Python 3.x).
    advance_method = "__next__" if PY3 else "next"

    if hasattr(f, advance_method):
        return engine

    raise ValueError("The 'python' engine cannot iterate "
                     "through this file buffer.")

def _clean_options(self, options, engine):
result = options.copy()

Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/dtypes/test_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,9 +120,9 @@ class MockFile(object):
m = MockFile()
assert not is_file(m)

# gh-16530: Valid iterator just means we have the
# __iter__ attribute for our purposes.
MockFile.__iter__ = lambda self: self
MockFile.__next__ = lambda self: 0
MockFile.next = MockFile.__next__

# Valid write-only file
m = MockFile()
Expand Down
36 changes: 36 additions & 0 deletions pandas/tests/io/parser/c_parser_only.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@
further arguments when parsing.
"""

import os
import sys
import tarfile

import pytest
import numpy as np
Expand Down Expand Up @@ -446,3 +448,37 @@ def test_comment_whitespace_delimited(self):
[7, np.nan],
[8, np.nan]])
tm.assert_frame_equal(df, expected)

def test_file_like_no_next(self):
    """
    Parse a buffer whose "next" / "__next__" raises AttributeError.

    gh-16530: the file-like need not have a usable "next" or "__next__"
    attribute despite having an "__iter__" attribute.

    NOTE: This is only true for the C engine, not the Python engine.
    """
    class NoNextBuffer(StringIO):
        # Any attempt to advance the buffer as an iterator fails, so the
        # parse can only succeed if the reader never calls next() on it.
        def __next__(self):
            raise AttributeError("No next method")

        next = __next__

    csv_text = "a\n1"
    buf = NoNextBuffer(csv_text)

    result = self.read_csv(buf)
    expected = pd.DataFrame({"a": [1]})

    tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize("tar_suffix", [".tar", ".tar.gz"])
def test_read_tarfile(self, tar_suffix):
    """
    Read a CSV member directly out of a tar archive (see gh-16530).

    Unfortunately, Python's CSV library can't handle tarfile objects
    (expects string, not bytes when iterating through a file-like).

    Parameters
    ----------
    tar_suffix : str
        Extension of the archive fixture to open (".tar" or ".tar.gz").
    """
    tar_path = os.path.join(self.dirpath, "tar_csv" + tar_suffix)

    # Open the archive in a context manager so its file handle is
    # released even if parsing raises; previously it was never closed.
    # The member must be parsed while the archive is still open.
    with tarfile.open(tar_path, "r") as tar:
        data_file = tar.extractfile("tar_data.csv")
        out = self.read_csv(data_file)

    expected = pd.DataFrame({"a": [1]})
    tm.assert_frame_equal(out, expected)
Binary file added pandas/tests/io/parser/data/tar_csv.tar
Binary file not shown.
Binary file added pandas/tests/io/parser/data/tar_csv.tar.gz
Binary file not shown.
41 changes: 32 additions & 9 deletions pandas/tests/io/parser/test_unsupported.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,13 @@
from pandas.errors import ParserError
from pandas.io.parsers import read_csv, read_table

import pytest


@pytest.fixture(params=["python", "python-fwf"], ids=lambda val: val)
def python_engine(request):
    """
    Parametrized fixture supplying an engine name to each test.

    Tests that request it run once with "python" and once with
    "python-fwf"; the test id is the engine name itself.
    """
    engine_name = request.param
    return engine_name


class TestUnsupportedFeatures(object):

Expand Down Expand Up @@ -82,24 +89,40 @@ def test_c_engine(self):
with tm.assert_raises_regex(ValueError, msg):
read_csv(StringIO(data), lineterminator='~~')

def test_python_engine(self):
def test_python_engine(self, python_engine):
from pandas.io.parsers import _python_unsupported as py_unsupported

data = """1,2,3,,
1,2,3,4,
1,2,3,4,5
1,2,,,
1,2,3,4,"""
engines = 'python', 'python-fwf'

for engine in engines:
for default in py_unsupported:
msg = ('The %r option is not supported '
'with the %r engine' % (default, engine))
for default in py_unsupported:
msg = ('The %r option is not supported '
'with the %r engine' % (default, python_engine))

kwargs = {default: object()}
with tm.assert_raises_regex(ValueError, msg):
read_csv(StringIO(data), engine=python_engine, **kwargs)

kwargs = {default: object()}
with tm.assert_raises_regex(ValueError, msg):
read_csv(StringIO(data), engine=engine, **kwargs)
def test_python_engine_file_no_next(self, python_engine):
    """
    Check that ``read_csv`` rejects a buffer lacking a next method.

    See gh-16530. The buffer below defines ``read`` and ``__iter__``
    but neither "next" nor "__next__", and the test expects a
    ValueError whose message says the 'python' engine cannot iterate.
    """
    class NoNextBuffer(object):
        def __init__(self, csv_data):
            self.data = csv_data

        def __iter__(self):
            return self

        def read(self):
            return self.data

    csv_text = "a\n1"
    expected_msg = "The 'python' engine cannot iterate"

    with tm.assert_raises_regex(ValueError, expected_msg):
        read_csv(NoNextBuffer(csv_text), engine=python_engine)


class TestDeprecatedFeatures(object):
Expand Down
2 changes: 2 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -702,6 +702,8 @@ def pxd(name):
'parser/data/*.gz',
'parser/data/*.bz2',
'parser/data/*.txt',
'parser/data/*.tar',
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you add with .tar.gz as well?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I meant test with .tar.gz as well.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, I see. Done.

'parser/data/*.tar.gz',
'sas/data/*.csv',
'sas/data/*.xpt',
'sas/data/*.sas7bdat',
Expand Down