diff --git a/doc/source/release.rst b/doc/source/release.rst index 35c22fdf03d9a..fce4b6a5e47eb 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -56,12 +56,16 @@ New features API Changes ~~~~~~~~~~~ + Experimental Features ~~~~~~~~~~~~~~~~~~~~~ Improvements to existing features ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +- pd.read_clipboard will, if 'sep' is unspecified, try to detect data copied from a spreadsheet + and parse accordingly. (:issue:`6223`) + .. _release.bug_fixes-0.14.0: Bug Fixes diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt index 3ae33a909eca4..879e6466611cb 100644 --- a/doc/source/v0.14.0.txt +++ b/doc/source/v0.14.0.txt @@ -28,10 +28,13 @@ There are no deprecations of prior behavior in 0.14.0 Enhancements ~~~~~~~~~~~~ +- pd.read_clipboard will, if 'sep' is unspecified, try to detect data copied from a spreadsheet + and parse accordingly. (:issue:`6223`) + + Performance ~~~~~~~~~~~ - Experimental ~~~~~~~~~~~~ diff --git a/pandas/io/clipboard.py b/pandas/io/clipboard.py index e90d9ddef707a..52d950ef5b598 100644 --- a/pandas/io/clipboard.py +++ b/pandas/io/clipboard.py @@ -14,12 +14,29 @@ def read_clipboard(**kwargs): # pragma: no cover ------- parsed : DataFrame """ - if kwargs.get('sep') is None and kwargs.get('delim_whitespace') is None: - kwargs['sep'] = '\s+' from pandas.util.clipboard import clipboard_get from pandas.io.parsers import read_table text = clipboard_get() + # Excel copies into clipboard with \t separation + # inspect no more than the first 10 lines, if they + # all contain an equal number (>0) of tabs, infer + # that this came from Excel and set 'sep' accordingly + lines = text[:10000].split('\n')[:-1][:10] + + # Need to remove leading white space, since read_table + # accepts: + # a b + # 0 1 2 + # 1 3 4 + + counts = set([x.lstrip().count('\t') for x in lines]) + if len(lines)>1 and len(counts) == 1 and counts.pop() != 0: + kwargs['sep'] = '\t' + + if kwargs.get('sep') is None and 
kwargs.get('delim_whitespace') is None: + kwargs['sep'] = '\s+' + # try to decode (if needed on PY3) if compat.PY3: try: diff --git a/pandas/io/tests/test_clipboard.py b/pandas/io/tests/test_clipboard.py index 3556dfd999d40..482c81fc8e7c0 100644 --- a/pandas/io/tests/test_clipboard.py +++ b/pandas/io/tests/test_clipboard.py @@ -2,6 +2,7 @@ from numpy.random import randint import nose +import pandas as pd from pandas import DataFrame from pandas import read_clipboard @@ -65,3 +66,37 @@ def test_round_trip_frame_string(self): def test_round_trip_frame(self): for dt in self.data_types: self.check_round_trip_frame(dt) + + def test_read_clipboard_infer_excel(self): + from textwrap import dedent + from pandas.util.clipboard import clipboard_set + + text = dedent(""" + John James Charlie Mingus + 1 2 + 4 Harry Carney + """.strip()) + clipboard_set(text) + df = pd.read_clipboard() + + # excel data is parsed correctly + self.assertEqual(df.iloc[1][1], 'Harry Carney') + + # having diff tab counts doesn't trigger it + text = dedent(""" + a\t b + 1 2 + 3 4 + """.strip()) + clipboard_set(text) + res = pd.read_clipboard() + + text = dedent(""" + a b + 1 2 + 3 4 + """.strip()) + clipboard_set(text) + exp = pd.read_clipboard() + + tm.assert_frame_equal(res, exp)