diff --git a/doc/source/text.rst b/doc/source/text.rst index bcbf7f0ef78d7..02fa2d882f8b1 100644 --- a/doc/source/text.rst +++ b/doc/source/text.rst @@ -247,27 +247,87 @@ Missing values on either side will result in missing values in the result as wel s.str.cat(t) s.str.cat(t, na_rep='-') -Series are *not* aligned on their index before concatenation: +Concatenating a Series and something array-like into a Series +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. versionadded:: 0.23.0 + +The parameter ``others`` can also be two-dimensional. In this case, the number of rows must match the length of the calling ``Series`` (or ``Index``). .. ipython:: python - u = pd.Series(['b', 'd', 'e', 'c'], index=[1, 3, 4, 2]) - # without alignment + d = pd.concat([t, s], axis=1) + s + d + s.str.cat(d, na_rep='-') + +Concatenating a Series and an indexed object into a Series, with alignment +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. versionadded:: 0.23.0 + +For concatenation with a ``Series`` or ``DataFrame``, it is possible to align the indexes before concatenation by setting +the ``join``-keyword. + +.. ipython:: python + + u = pd.Series(['b', 'd', 'a', 'c'], index=[1, 3, 0, 2]) + s + u s.str.cat(u) - # with separate alignment - v, w = s.align(u) - v.str.cat(w, na_rep='-') + s.str.cat(u, join='left') + +.. warning:: + + If the ``join`` keyword is not passed, the method :meth:`~Series.str.cat` will currently fall back to the behavior before version 0.23.0 (i.e. no alignment), + but a ``FutureWarning`` will be raised if any of the involved indexes differ, since this default will change to ``join='left'`` in a future version. + +The usual options are available for ``join`` (one of ``'left', 'outer', 'inner', 'right'``). +In particular, alignment also means that the different lengths do not need to coincide anymore. + +.. 
ipython:: python + + v = pd.Series(['z', 'a', 'b', 'd', 'e'], index=[-1, 0, 1, 3, 4]) + s + v + s.str.cat(v, join='left', na_rep='-') + s.str.cat(v, join='outer', na_rep='-') + +The same alignment can be used when ``others`` is a ``DataFrame``: + +.. ipython:: python + + f = d.loc[[3, 2, 1, 0], :] + s + f + s.str.cat(f, join='left', na_rep='-') Concatenating a Series and many objects into a Series ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -List-likes (excluding iterators, ``dict``-views, etc.) can be arbitrarily combined in a list. -All elements of the list must match in length to the calling ``Series`` (resp. ``Index``): +All one-dimensional list-likes can be arbitrarily combined in a list-like container (including iterators, ``dict``-views, etc.): + +.. ipython:: python + + s + u + s.str.cat([u, pd.Index(u.values), ['A', 'B', 'C', 'D'], map(int, u.index)], na_rep='-') + +All elements must match in length to the calling ``Series`` (or ``Index``), except those having an index if ``join`` is not None: + +.. ipython:: python + + v + s.str.cat([u, v, ['A', 'B', 'C', 'D']], join='outer', na_rep='-') + +If using ``join='right'`` on a list of ``others`` that contains different indexes, +the union of these indexes will be used as the basis for the final concatenation: .. ipython:: python - x = pd.Series([1, 2, 3, 4], index=['A', 'B', 'C', 'D']) - s.str.cat([['A', 'B', 'C', 'D'], s, s.values, x.index]) + u.loc[[3]] + v.loc[[-1, 0]] + s.str.cat([u.loc[[3]], v.loc[[-1, 0]]], join='right', na_rep='-') Indexing with ``.str`` ---------------------- diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index c194d98a89789..d3746d9e0b61e 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -308,6 +308,24 @@ The :func:`DataFrame.assign` now accepts dependent keyword arguments for python df.assign(A=df.A+1, C= lambda df: df.A* -1) +.. 
_whatsnew_0230.enhancements.str_cat_align: + +``Series.str.cat`` has gained the ``join`` kwarg +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Previously, :meth:`Series.str.cat` did not -- in contrast to most of ``pandas`` -- align :class:`Series` on their index before concatenation (see :issue:`18657`). +The method has now gained a keyword ``join`` to control the manner of alignment, see examples below and in :ref:`here <text.concatenate>`. + +In v.0.23 ``join`` will default to None (meaning no alignment), but this default will change to ``'left'`` in a future version of pandas. + +.. ipython:: python + + s = pd.Series(['a', 'b', 'c', 'd']) + t = pd.Series(['b', 'd', 'e', 'c'], index=[1, 3, 4, 2]) + s.str.cat(t) + s.str.cat(t, join='left', na_rep='-') + +Furthermore, :meth:`Series.str.cat` now works for ``CategoricalIndex`` as well (previously raised a ``ValueError``; see :issue:`20842`). .. _whatsnew_0230.enhancements.astype_category: diff --git a/pandas/core/strings.py b/pandas/core/strings.py index c6d45ce5413ac..da4845fcdfc8e 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -65,7 +65,7 @@ def _get_array_list(arr, others): def str_cat(arr, others=None, sep=None, na_rep=None): """ - Concatenate strings in the Series/Index with given separator. + Auxiliary function for :meth:`str.cat` If `others` is specified, this function concatenates the Series/Index and elements of `others` element-wise. @@ -84,42 +84,9 @@ def str_cat(arr, others=None, sep=None, na_rep=None): Returns ------- - concat : Series/Index of objects or str - - See Also - -------- - split : Split each string in the Series/Index - - Examples - -------- - When not passing `other`, all values are concatenated into a single - string: - - >>> s = pd.Series(['a', 'b', np.nan, 'c']) - >>> s.str.cat(sep=' ') - 'a b c' - - By default, NA values in the Series are ignored. Using `na_rep`, they - can be given a representation: - - >>> pd.Series(['a', 'b', np.nan, 'c']).str.cat(sep=' ', na_rep='?') - 'a b ? 
c' - - If `others` is specified, corresponding values are - concatenated with the separator. Result will be a Series of strings. - - >>> pd.Series(['a', 'b', 'c']).str.cat(['A', 'B', 'C'], sep=',') - 0 a,A - 1 b,B - 2 c,C - dtype: object - - Also, you can pass a list of list-likes. - - >>> pd.Series(['a', 'b']).str.cat([['x', 'y'], ['1', '2']], sep=',') - 0 a,x,1 - 1 b,y,2 - dtype: object + concat + ndarray containing concatenated results (if `others is not None`) + or str (if `others is None`) """ if sep is None: sep = '' @@ -174,7 +141,7 @@ def _length_check(others): elif len(x) != n: raise ValueError('All arrays must be same length') except TypeError: - raise ValueError("Did you mean to supply a `sep` keyword?") + raise ValueError('Must pass arrays containing strings to str_cat') return n @@ -1833,7 +1800,9 @@ class StringMethods(NoNewAttributesMixin): def __init__(self, data): self._validate(data) self._is_categorical = is_categorical_dtype(data) - self._data = data.cat.categories if self._is_categorical else data + + # .values.categories works for both Series/Index + self._data = data.values.categories if self._is_categorical else data # save orig to blow up categoricals to the right type self._orig = data self._freeze() @@ -1859,7 +1828,11 @@ def _validate(data): # see src/inference.pyx which can contain string values allowed_types = ('string', 'unicode', 'mixed', 'mixed-integer') - if data.inferred_type not in allowed_types: + if is_categorical_dtype(data.dtype): + inf_type = data.categories.inferred_type + else: + inf_type = data.inferred_type + if inf_type not in allowed_types: message = ("Can only use .str accessor with string values " "(i.e. 
def _get_series_list(self, others, ignore_index=False):
    """
    Auxiliary function for :meth:`str.cat`. Turn potentially mixed input
    into a list of Series (elements without an index must match the length
    of the calling Series/Index).

    Parameters
    ----------
    others : Series, Index, DataFrame, np.ndarray, list-like or list-like
        of objects that are either Series, Index, np.ndarray (1-dim) or
        list-like
    ignore_index : boolean, default False
        Determines whether to forcefully align with index of the caller

    Returns
    -------
    tuple : (others transformed into list of Series,
             boolean whether FutureWarning should be raised)

    Raises
    ------
    TypeError
        If `others` is not one of the supported types, mixes list-likes
        with non-list-likes, or contains nested list-likes / DataFrames /
        two-dimensional arrays inside a list-like.
    """

    # once str.cat defaults to alignment, this function can be simplified;
    # will not need `ignore_index` and the second boolean output anymore

    from pandas import Index, Series, DataFrame, isnull

    # self._orig is either Series or Index
    idx = self._orig if isinstance(self._orig, Index) else self._orig.index

    err_msg = ('others must be Series, Index, DataFrame, np.ndarray or '
               'list-like (either containing only strings or containing '
               'only objects of type Series/Index/list-like/np.ndarray)')

    if isinstance(others, Series):
        fu_wrn = not others.index.equals(idx)
        # only re-index (positionally) if the indexes actually differ
        los = [Series(others.values, index=idx)
               if ignore_index and fu_wrn else others]
        return (los, fu_wrn)
    elif isinstance(others, Index):
        fu_wrn = not others.equals(idx)
        # an Index is its own index unless alignment is disabled
        los = [Series(others.values,
                      index=(idx if ignore_index else others))]
        return (los, fu_wrn)
    elif isinstance(others, DataFrame):
        fu_wrn = not others.index.equals(idx)
        if ignore_index and fu_wrn:
            # without copy, this could change "others"
            # that was passed to str.cat
            others = others.copy()
            others.index = idx
        return ([others[x] for x in others], fu_wrn)
    elif isinstance(others, np.ndarray) and others.ndim == 2:
        others = DataFrame(others, index=idx)
        return ([others[x] for x in others], False)
    elif is_list_like(others):
        others = list(others)  # ensure iterators do not get read twice etc
        if all(is_list_like(x) for x in others):
            los = []
            fu_wrn = False
            for nxt in others:  # list-like as per check above
                # safety for iterators and other non-persistent list-likes
                # do not map indexed/typed objects; would lose information
                if not isinstance(nxt, (DataFrame, Series,
                                        Index, np.ndarray)):
                    nxt = list(nxt)

                # known types without deep inspection
                no_deep = ((isinstance(nxt, np.ndarray) and nxt.ndim == 1)
                           or isinstance(nxt, (Series, Index)))
                # Nested list-likes are forbidden - elements of nxt must be
                # strings/NaN/None. Need to robustify NaN-check against
                # x in nxt being list-like (otherwise ambiguous boolean)
                is_legal = ((no_deep and nxt.dtype == object)
                            or all((isinstance(x, compat.string_types)
                                    or (not is_list_like(x) and isnull(x))
                                    or x is None)
                                   for x in nxt))
                # DataFrame is false positive of is_legal
                # because "x in df" returns column names
                if not is_legal or isinstance(nxt, DataFrame):
                    raise TypeError(err_msg)

                nxt_los, fwn = self._get_series_list(
                    nxt, ignore_index=ignore_index)
                los.extend(nxt_los)
                fu_wrn = fu_wrn or fwn
            return (los, fu_wrn)
        # Not all elements are list-like (checked above), so finding any
        # list-like element here means list-likes and non-list-likes
        # (e.g. str) are mixed
        elif any(is_list_like(x) for x in others):
            raise TypeError(err_msg)
        else:  # all elements in others are _not_ list-like
            return ([Series(others, index=idx)], False)
    raise TypeError(err_msg)


def cat(self, others=None, sep=None, na_rep=None, join=None):
    """
    Concatenate strings in the Series/Index with given separator.

    If `others` is specified, this function concatenates the Series/Index
    and elements of `others` element-wise.
    If `others` is not passed, then all values in the Series/Index are
    concatenated into a single string with a given `sep`.

    Parameters
    ----------
    others : Series, Index, DataFrame, np.ndarray or list-like
        Series, Index, DataFrame, np.ndarray (one- or two-dimensional) and
        other list-likes of strings must have the same length as the
        calling Series/Index, with the exception of indexed objects (i.e.
        Series/Index/DataFrame) if `join` is not None.

        If others is a list-like that contains a combination of Series,
        np.ndarray (1-dim) or list-like, then all elements will be unpacked
        and must satisfy the above criteria individually.

        If others is None, the method returns the concatenation of all
        strings in the calling Series/Index.
    sep : string or None, default None
        If None, concatenates without any separator.
    na_rep : string or None, default None
        Representation that is inserted for all missing values:

        - If `na_rep` is None, and `others` is None, missing values in the
          Series/Index are omitted from the result.
        - If `na_rep` is None, and `others` is not None, a row containing a
          missing value in any of the columns (before concatenation) will
          have a missing value in the result.
    join : {'left', 'right', 'outer', 'inner'}, default None
        Determines the join-style between the calling Series/Index and any
        Series/Index/DataFrame in `others` (objects without an index need
        to match the length of the calling Series/Index). If None,
        alignment is disabled, but this option will be removed in a future
        version of pandas and replaced with a default of `'left'`. To
        disable alignment, use `.values` on any Series/Index/DataFrame in
        `others`.

        .. versionadded:: 0.23.0

    Returns
    -------
    concat : str if `others is None`, Series/Index of objects if `others is
        not None`. In the latter case, the result carries the name of the
        calling Series/Index and is built from the concatenated strings
        (it does not keep a categorical dtype of the caller).

    Raises
    ------
    ValueError
        If `others` is a string (most likely `sep` was intended), or if an
        object without an index does not match the length of the calling
        Series/Index.
    TypeError
        If `others` is not of a supported type (see
        :meth:`_get_series_list`).

    See Also
    --------
    split : Split each string in the Series/Index

    Examples
    --------
    When not passing `others`, all values are concatenated into a single
    string:

    >>> s = pd.Series(['a', 'b', np.nan, 'd'])
    >>> s.str.cat(sep=' ')
    'a b d'

    By default, NA values in the Series are ignored. Using `na_rep`, they
    can be given a representation:

    >>> s.str.cat(sep=' ', na_rep='?')
    'a b ? d'

    If `others` is specified, corresponding values are concatenated with
    the separator. Result will be a Series of strings.

    >>> s.str.cat(['A', 'B', 'C', 'D'], sep=',')
    0    a,A
    1    b,B
    2    NaN
    3    d,D
    dtype: object

    Missing values will remain missing in the result, but can again be
    represented using `na_rep`

    >>> s.str.cat(['A', 'B', 'C', 'D'], sep=',', na_rep='-')
    0    a,A
    1    b,B
    2    -,C
    3    d,D
    dtype: object

    If `sep` is not specified, the values are concatenated without
    separation.

    >>> s.str.cat(['A', 'B', 'C', 'D'], na_rep='-')
    0    aA
    1    bB
    2    -C
    3    dD
    dtype: object

    Series with different indexes can be aligned before concatenation. The
    `join`-keyword works as in other methods.

    >>> t = pd.Series(['d', 'a', 'e', 'c'], index=[3, 0, 4, 2])
    >>> s.str.cat(t, join=None, na_rep='-')
    0    ad
    1    ba
    2    -e
    3    dc
    dtype: object
    >>>
    >>> s.str.cat(t, join='left', na_rep='-')
    0    aa
    1    b-
    2    -c
    3    dd
    dtype: object
    >>>
    >>> s.str.cat(t, join='outer', na_rep='-')
    0    aa
    1    b-
    2    -c
    3    dd
    4    -e
    dtype: object
    >>>
    >>> s.str.cat(t, join='inner', na_rep='-')
    0    aa
    2    -c
    3    dd
    dtype: object
    >>>
    >>> s.str.cat(t, join='right', na_rep='-')
    3    dd
    0    aa
    4    -e
    2    -c
    dtype: object

    For more examples, see :ref:`here <text.concatenate>`.
    """
    from pandas import Index, Series, concat

    if isinstance(others, compat.string_types):
        raise ValueError("Did you mean to supply a `sep` keyword?")

    if isinstance(self._orig, Index):
        # use the Index as its own index so that it can be aligned
        data = Series(self._orig, index=self._orig)
    else:  # Series
        data = self._orig

    # concatenate Series/Index with itself if no "others"
    if others is None:
        result = str_cat(data, others=others, sep=sep, na_rep=na_rep)
        return self._wrap_result(result,
                                 use_codes=(not self._is_categorical))

    try:
        # turn anything in "others" into lists of Series
        others, fu_wrn = self._get_series_list(others,
                                               ignore_index=(join is None))
    except ValueError:  # do not catch TypeError raised by _get_series_list
        if join is None:
            raise ValueError('All arrays must be same length, except '
                             'those having an index if `join` is not None')
        else:
            raise ValueError('If `others` contains arrays or lists (or '
                             'other list-likes without an index), these '
                             'must all be of the same length as the '
                             'calling Series/Index.')

    if join is None and fu_wrn:
        warnings.warn("A future version of pandas will perform index "
                      "alignment when `others` is a Series/Index/"
                      "DataFrame (or a list-like containing one). To "
                      "disable alignment (the behavior before v.0.23) and "
                      "silence this warning, use `.values` on any Series/"
                      "Index/DataFrame in `others`. To enable alignment "
                      "and silence this warning, pass `join='left'|"
                      "'outer'|'inner'|'right'`. The future default will "
                      "be `join='left'`.", FutureWarning, stacklevel=2)

    # align if required
    if join is not None:
        # Need to add keys for uniqueness in case of duplicate columns.
        # "others" are first combined among themselves ('outer' unless the
        # final join is 'inner'), then aligned with the caller.
        others = concat(others, axis=1,
                        join=(join if join == 'inner' else 'outer'),
                        keys=range(len(others)))
        data, others = data.align(others, join=join)
        others = [others[x] for x in others]  # again list of Series

    # str_cat discards index
    res = str_cat(data, others=others, sep=sep, na_rep=na_rep)

    # Re-wrap the raw result; keep the caller's name, as the previous
    # implementation did by going through _wrap_result.
    if isinstance(self._orig, Index):
        # dtype=object prevents type inference on an all-NA result
        res = Index(res, dtype=object, name=self._orig.name)
    else:  # Series
        res = Series(res, index=data.index, name=self._orig.name)
    return res
"Must pass arrays containing strings to str_cat" + with tm.assert_raises_regex(ValueError, rgx): + strings.str_cat(one, 'three') + + @pytest.mark.parametrize('series_or_index', ['series', 'index']) + def test_str_cat(self, series_or_index): + # test_cat above tests "str_cat" from ndarray to ndarray; + # here testing "str.cat" from Series/Index to Series/Index/ndarray/list + s = Index(['a', 'a', 'b', 'b', 'c', np.nan]) + if series_or_index == 'series': + s = Series(s) + t = Index(['a', np.nan, 'b', 'd', 'foo', np.nan]) + + # single array + result = s.str.cat() + exp = 'aabbc' + assert result == exp + + result = s.str.cat(na_rep='-') + exp = 'aabbc-' + assert result == exp + + result = s.str.cat(sep='_', na_rep='NA') + exp = 'a_a_b_b_c_NA' + assert result == exp + + # Series/Index with Index + exp = Index(['aa', 'a-', 'bb', 'bd', 'cfoo', '--']) + if series_or_index == 'series': + exp = Series(exp) + # s.index / s is different from t (as Index) -> warning + with tm.assert_produces_warning(expected_warning=FutureWarning): + # FutureWarning to switch to alignment by default + assert_series_or_index_equal(s.str.cat(t, na_rep='-'), exp) + + # Series/Index with Series + t = Series(t) + # s as Series has same index as t -> no warning + # s as Index is different from t.index -> warning (tested below) + if series_or_index == 'series': + assert_series_equal(s.str.cat(t, na_rep='-'), exp) + + # Series/Index with Series: warning if different indexes + t.index = t.index + 1 + with tm.assert_produces_warning(expected_warning=FutureWarning): + # FutureWarning to switch to alignment by default + assert_series_or_index_equal(s.str.cat(t, na_rep='-'), exp) + + # Series/Index with array + assert_series_or_index_equal(s.str.cat(t.values, na_rep='-'), exp) + + # Series/Index with list + assert_series_or_index_equal(s.str.cat(list(t), na_rep='-'), exp) + + # errors for incorrect lengths + rgx = 'All arrays must be same length, except.*' + z = Series(['1', '2', '3']) + + with 
@pytest.mark.parametrize('series_or_index', ['series', 'index'])
def test_str_cat_raises_intuitive_error(self, series_or_index):
    """Passing a bare string as `others` must hint at the `sep` keyword."""
    # https://github.com/pandas-dev/pandas/issues/11334
    caller = Index(['a', 'b', 'c', 'd'])
    if series_or_index == 'series':
        caller = Series(caller)

    expected_msg = "Did you mean to supply a `sep` keyword?"
    # a lone string in the first positional slot is taken as `others`
    for separator in ('|', ' '):
        with tm.assert_raises_regex(ValueError, expected_msg):
            caller.str.cat(separator)
tm.assert_produces_warning(expected_warning=FutureWarning): + # FutureWarning to switch to alignment by default + assert_series_or_index_equal(s.str.cat(t, na_rep='-'), exp) + + @pytest.mark.parametrize('series_or_index', ['series', 'index']) + def test_str_cat_mixed_inputs(self, series_or_index): + s = Index(['a', 'b', 'c', 'd']) + if series_or_index == 'series': + s = Series(s) + t = Series(['A', 'B', 'C', 'D']) + d = concat([t, Series(s)], axis=1) + + exp = Index(['aAa', 'bBb', 'cCc', 'dDd']) + if series_or_index == 'series': + exp = Series(exp) + + # Series/Index with DataFrame + # s as Series has same index as d -> no warning + # s as Index is different from d.index -> warning (tested below) + if series_or_index == 'series': + assert_series_equal(s.str.cat(d), exp) + + # Series/Index with DataFrame: warning if different indexes + d.index = d.index + 1 + with tm.assert_produces_warning(expected_warning=FutureWarning): + # FutureWarning to switch to alignment by default + assert_series_or_index_equal(s.str.cat(d), exp) + + # Series/Index with two-dimensional ndarray + assert_series_or_index_equal(s.str.cat(d.values), exp) + + # Series/Index with list of Series + # s as Series has same index as t, s -> no warning + # s as Index is different from t.index -> warning (tested below) + if series_or_index == 'series': + assert_series_equal(s.str.cat([t, s]), exp) + + # Series/Index with list of Series: warning if different indexes + tt = t.copy() + tt.index = tt.index + 1 + with tm.assert_produces_warning(expected_warning=FutureWarning): + # FutureWarning to switch to alignment by default + assert_series_or_index_equal(s.str.cat([tt, s]), exp) + + # Series/Index with list of list-likes + assert_series_or_index_equal(s.str.cat([t.values, list(s)]), exp) + + # Series/Index with mixed list of Series/list-like + # s as Series has same index as t -> no warning + # s as Index is different from t.index -> warning (tested below) + if series_or_index == 'series': + 
assert_series_equal(s.str.cat([t, s.values]), exp) + + # Series/Index with mixed list: warning if different indexes + with tm.assert_produces_warning(expected_warning=FutureWarning): + # FutureWarning to switch to alignment by default + assert_series_or_index_equal(s.str.cat([tt, s.values]), exp) + + # Series/Index with iterator of list-likes + assert_series_or_index_equal(s.str.cat(iter([t.values, list(s)])), exp) + + # errors for incorrect lengths + rgx = 'All arrays must be same length, except.*' + z = Series(['1', '2', '3']) + e = concat([z, z], axis=1) + + # DataFrame + with tm.assert_raises_regex(ValueError, rgx): + s.str.cat(e) + + # two-dimensional ndarray + with tm.assert_raises_regex(ValueError, rgx): + s.str.cat(e.values) + + # list of Series + with tm.assert_raises_regex(ValueError, rgx): + s.str.cat([z, s]) + + # list of list-likes + with tm.assert_raises_regex(ValueError, rgx): + s.str.cat([z.values, list(s)]) + + # mixed list of Series/list-like + with tm.assert_raises_regex(ValueError, rgx): + s.str.cat([z, list(s)]) + + # errors for incorrect arguments in list-like + rgx = 'others must be Series, Index, DataFrame,.*' + # make sure None/NaN do not crash checks in _get_series_list + u = Series(['a', np.nan, 'c', None]) + + # mix of string and Series + with tm.assert_raises_regex(TypeError, rgx): + s.str.cat([u, 'u']) + + # DataFrame in list + with tm.assert_raises_regex(TypeError, rgx): + s.str.cat([u, d]) + + # 2-dim ndarray in list + with tm.assert_raises_regex(TypeError, rgx): + s.str.cat([u, d.values]) + + # nested lists + with tm.assert_raises_regex(TypeError, rgx): + s.str.cat([u, [u, d]]) + + # forbidden input type, e.g. 
@pytest.mark.parametrize('series_or_index, join', [
    ('series', 'left'), ('series', 'outer'),
    ('series', 'inner'), ('series', 'right'),
    ('index', 'left'), ('index', 'outer'),
    ('index', 'inner'), ('index', 'right')
])
def test_str_cat_align_indexed(self, series_or_index, join):
    """`str.cat(..., join=...)` must equal concatenating pre-aligned inputs."""
    # https://github.com/pandas-dev/pandas/issues/18657
    s = Series(['a', 'b', 'c', 'd'], index=['a', 'b', 'c', 'd'])
    t = Series(['D', 'A', 'E', 'B'], index=['d', 'a', 'e', 'b'])
    sa, ta = s.align(t, join=join)
    # result after manual alignment of inputs
    exp = sa.str.cat(ta, na_rep='-')

    if series_or_index == 'index':
        # an Index caller yields an Index result; `sa` is no longer needed
        # once the expected values have been computed
        s = Index(s)
        exp = Index(exp)

    assert_series_or_index_equal(s.str.cat(t, join=join, na_rep='-'), exp)
def test_str_cat_special_cases(self):
    """Cover an iterator of mixed element types and a right join."""
    caller = Series(['a', 'b', 'c', 'd'])
    other = Series(['d', 'a', 'e', 'b'], index=[3, 0, 4, 1])

    # iterator of elements with different types
    mixed_iter = iter([other, ['A', 'B', 'C', 'D']])
    expected = Series(['aaA', 'bbB', 'c-C', 'ddD', '-e-'])
    result = caller.str.cat(mixed_iter, join='outer', na_rep='-')
    tm.assert_series_equal(result, expected)

    # right-align with different indexes in others
    pieces = [other.loc[[0]], other.loc[[3]]]
    expected = Series(['aa-', 'd-d'], index=[0, 3])
    result = caller.str.cat(pieces, join='right', na_rep='-')
    tm.assert_series_equal(result, expected)
str_year.str.cat(str_month, sep=' ') - - assert str_both.loc[1] == '2011 2' - - str_multiple = str_year.str.cat([str_month, str_month], sep=' ') - - assert str_multiple.loc[1] == '2011 2 2' - - def test_str_cat_raises_intuitive_error(self): - # https://github.com/pandas-dev/pandas/issues/11334 - s = Series(['a', 'b', 'c', 'd']) - message = "Did you mean to supply a `sep` keyword?" - with tm.assert_raises_regex(ValueError, message): - s.str.cat('|') - with tm.assert_raises_regex(ValueError, message): - s.str.cat(' ') - def test_index_str_accessor_visibility(self): from pandas.core.strings import StringMethods @@ -2857,9 +3162,9 @@ def test_method_on_bytes(self): lhs = Series(np.array(list('abc'), 'S1').astype(object)) rhs = Series(np.array(list('def'), 'S1').astype(object)) if compat.PY3: - pytest.raises(TypeError, lhs.str.cat, rhs) + pytest.raises(TypeError, lhs.str.cat, rhs, join='left') else: - result = lhs.str.cat(rhs) + result = lhs.str.cat(rhs, join='left') expected = Series(np.array( ['ad', 'be', 'cf'], 'S2').astype(object)) tm.assert_series_equal(result, expected)