diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 2db03724e564d..23f2589adde89 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -246,6 +246,7 @@ Other API Changes - ``DataFrame.applymap()`` with an empty ``DataFrame`` will return a copy of the empty ``DataFrame`` instead of a ``Series`` (:issue:`8222`) - ``pd.read_csv()`` will now issue a ``ParserWarning`` whenever there are conflicting values provided by the ``dialect`` parameter and the user (:issue:`14898`) +- ``pd.read_csv()`` will now raise a ``ValueError`` for the C engine if the quote character is larger than one byte (:issue:`11592`) .. _whatsnew_0200.deprecations: diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 040ec3d803303..8a9873b240602 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -841,6 +841,17 @@ def _clean_options(self, options, engine): encoding=encoding) engine = 'python' + quotechar = options['quotechar'] + if (quotechar is not None and + isinstance(quotechar, (str, compat.text_type, bytes))): + if (len(quotechar) == 1 and ord(quotechar) > 127 and + engine not in ('python', 'python-fwf')): + fallback_reason = ("ord(quotechar) > 127, meaning the " + "quotechar is larger than one byte, " + "and the 'c' engine does not support " + "such quotechars") + engine = 'python' + if fallback_reason and engine_specified: raise ValueError(fallback_reason) diff --git a/pandas/io/tests/parser/quoting.py b/pandas/io/tests/parser/quoting.py index 765cec8243a0a..a692e03e868c7 100644 --- a/pandas/io/tests/parser/quoting.py +++ b/pandas/io/tests/parser/quoting.py @@ -149,5 +149,5 @@ def test_quotechar_unicode(self): # Compared to Python 3.x, Python 2.x does not handle unicode well. 
if PY3: - result = self.read_csv(StringIO(data), quotechar=u('\u0394')) + result = self.read_csv(StringIO(data), quotechar=u('\u0001')) tm.assert_frame_equal(result, expected) diff --git a/pandas/io/tests/parser/test_unsupported.py b/pandas/io/tests/parser/test_unsupported.py index 64f31a11440d8..4d93df16a0279 100644 --- a/pandas/io/tests/parser/test_unsupported.py +++ b/pandas/io/tests/parser/test_unsupported.py @@ -50,12 +50,16 @@ def test_c_engine(self): sep=None, delim_whitespace=False) with tm.assertRaisesRegexp(ValueError, msg): read_table(StringIO(data), engine='c', sep=r'\s') + with tm.assertRaisesRegexp(ValueError, msg): + read_table(StringIO(data), engine='c', quotechar=chr(128)) with tm.assertRaisesRegexp(ValueError, msg): read_table(StringIO(data), engine='c', skipfooter=1) # specify C-unsupported options without python-unsupported options with tm.assert_produces_warning(parsers.ParserWarning): read_table(StringIO(data), sep=None, delim_whitespace=False) + with tm.assert_produces_warning(parsers.ParserWarning): + read_table(StringIO(data), quotechar=chr(128)) with tm.assert_produces_warning(parsers.ParserWarning): read_table(StringIO(data), sep=r'\s') with tm.assert_produces_warning(parsers.ParserWarning):