BUG: address #590; add more concat tests with hierarchical index

wesm · wesm · commit e4f3fa83fbdb · 2012-01-07T18:57:09.000-05:00
diff --git a/pandas/core/common.py b/pandas/core/common.py
@@ -512,7 +512,7 @@ def _stringify(col):
     else:
         return '%s' % col
 
-def _float_format_default(v, width = None):
+def _float_format_default(v, width=None):
     """
     Take a float and its formatted representation and if it needs extra space
     to fit the width, reformat it to that width.
@@ -565,30 +565,41 @@ def _float_format_default(v, width = None):
 
         return fmt_str % v
 
-def _format(s, space=None, na_rep=None, float_format=None, col_width=None):
+def _format(s, dtype, space=None, na_rep=None, float_format=None,
+            col_width=None):
     def _just_help(x):
         if space is None:
             return x
         return x[:space].ljust(space)
 
-    if isinstance(s, float):
-        if na_rep is not None and isnull(s):
-            if np.isnan(s):
-                s = na_rep
-            return _just_help('%s' % s)
+    def _make_float_format(x):
+        if na_rep is not None and isnull(x):
+            if np.isnan(x):
+                x = ' ' + na_rep
+            return _just_help('%s' % x)
 
         if float_format:
-            formatted = float_format(s)
+            formatted = float_format(x)
         elif _float_format:
-            formatted = _float_format(s)
+            formatted = _float_format(x)
         else:
-            formatted = _float_format_default(s, col_width)
+            formatted = _float_format_default(x, col_width)
 
         return _just_help(formatted)
-    elif isinstance(s, int):
-        return _just_help('% d' % s)
+
+    def _make_int_format(x):
+        return _just_help('% d' % x)
+
+    if is_float_dtype(dtype):
+        return _make_float_format(s)
+    elif is_integer_dtype(dtype):
+        return _make_int_format(s)
     else:
-        return _just_help('%s' % _stringify(s))
+        if na_rep is not None and lib.checknull(s):
+            return na_rep
+        else:
+            # object dtype
+            return _just_help('%s' % _stringify(s))
 
 #------------------------------------------------------------------------------
 # miscellaneous python tools
@@ -727,11 +738,19 @@ def is_integer(obj):
 def is_float(obj):
     return isinstance(obj, (float, np.floating))
 
-def is_integer_dtype(arr):
-    return issubclass(arr.dtype.type, np.integer)
+def is_integer_dtype(arr_or_dtype):
+    if isinstance(arr_or_dtype, np.dtype):
+        tipo = arr_or_dtype.type
+    else:
+        tipo = arr_or_dtype.dtype.type
+    return issubclass(tipo, np.integer)
 
-def is_float_dtype(arr):
-    return issubclass(arr.dtype.type, np.floating)
+def is_float_dtype(arr_or_dtype):
+    if isinstance(arr_or_dtype, np.dtype):
+        tipo = arr_or_dtype.type
+    else:
+        tipo = arr_or_dtype.dtype.type
+    return issubclass(tipo, np.floating)
 
 def save(obj, path):
     """
diff --git a/pandas/core/format.py b/pandas/core/format.py
@@ -99,11 +99,15 @@ def to_string(self):
 
         self.buf.writelines(to_write)
 
-    def _default_col_formatter(self, v, col_width=None):
+    def _get_col_formatter(self, dtype):
         from pandas.core.common import _format
 
-        return _format(v, space=self.col_space, na_rep=self.na_rep,
-                       float_format=self.float_format, col_width=col_width)
+        def formatter(x, col_width=None):
+            return _format(x, dtype, space=self.col_space,
+                           na_rep=self.na_rep,
+                           float_format=self.float_format,
+                           col_width=col_width)
+        return formatter
 
     def _format_col(self, col, i=None):
         if self.formatters is None:
@@ -117,7 +121,8 @@ def _format_col(self, col, i=None):
             else:
                 return formatter(self.frame[col][i])
         else:
-            formatter = self._default_col_formatter
+            dtype = self.frame[col].dtype
+            formatter = self._get_col_formatter(dtype)
 
             if i is not None:
                 return formatter(self.frame[col][i])
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
@@ -1567,18 +1567,18 @@ def test_repr(self):
         self.frame.reindex(columns=['A', 'B']).info(verbose=False, buf=buf)
 
         # big one
-        biggie = DataFrame(np.zeros((1000, 4)), columns=range(4),
-                            index=range(1000))
+        biggie = DataFrame(np.zeros((200, 4)), columns=range(4),
+                            index=range(200))
         foo = repr(biggie)
 
         # mixed
         foo = repr(self.mixed_frame)
         self.mixed_frame.info(verbose=False, buf=buf)
 
         # big mixed
-        biggie = DataFrame({'A' : randn(1000),
-                             'B' : tm.makeStringIndex(1000)},
-                            index=range(1000))
+        biggie = DataFrame({'A' : randn(200),
+                             'B' : tm.makeStringIndex(200)},
+                            index=range(200))
         biggie['A'][:20] = nan
         biggie['B'][:20] = nan
 
@@ -1675,9 +1675,9 @@ def test_to_string(self):
         import re
 
         # big mixed
-        biggie = DataFrame({'A' : randn(1000),
-                            'B' : tm.makeStringIndex(1000)},
-                            index=range(1000))
+        biggie = DataFrame({'A' : randn(200),
+                            'B' : tm.makeStringIndex(200)},
+                            index=range(200))
 
         biggie['A'][:20] = nan
         biggie['B'][:20] = nan
@@ -1717,7 +1717,7 @@ def test_to_string(self):
         biggie.to_string(columns=['B', 'A'], col_space=12,
                          float_format=str)
 
-        frame = DataFrame(index=np.arange(1000))
+        frame = DataFrame(index=np.arange(200))
         frame.to_string()
 
     def test_to_string_no_header(self):
@@ -1747,10 +1747,10 @@ def test_to_string_float_formatting(self):
 
         df_s = df.to_string()
 
-        expected = '   x       \n0  0.000000\n1  0.250000\n' \
-                   '2  3456.000\n3  1.20e+46\n4  1.64e+06\n' \
-                   '5  1.70e+08\n6  1.253456\n7  3.141593\n' \
-                   '8 -1.00e+06'
+        expected = ('   x       \n0  0.000000\n1  0.250000\n'
+                    '2  3456.000\n3  1.20e+46\n4  1.64e+06\n'
+                    '5  1.70e+08\n6  1.253456\n7  3.141593\n'
+                    '8 -1.00e+06')
         assert(df_s == expected)
 
         df = DataFrame({'x' : [3234, 0.253]})
@@ -1766,11 +1766,24 @@ def test_to_string_float_formatting(self):
         expected = '   x     \n0  1.e+09\n1  0.2512'
         assert(df_s == expected)
 
+    def test_to_string_format_na(self):
+        df = DataFrame({'A' : [np.nan, -1, -2.1234, 3, 4],
+                        'B' : [np.nan, 'foo', 'foooo', 'fooooo', 'bar']})
+        result = df.to_string()
+
+        expected = ('   A     B     \n'
+                    '0  NaN   NaN   \n'
+                    '1 -1.000 foo   \n'
+                    '2 -2.123 foooo \n'
+                    '3  3.000 fooooo\n'
+                    '4  4.000 bar   ')
+        self.assertEqual(result, expected)
+
     def test_to_html(self):
         # big mixed
-        biggie = DataFrame({'A' : randn(1000),
-                            'B' : tm.makeStringIndex(1000)},
-                            index=range(1000))
+        biggie = DataFrame({'A' : randn(200),
+                            'B' : tm.makeStringIndex(200)},
+                            index=range(200))
 
         biggie['A'][:20] = nan
         biggie['B'][:20] = nan
@@ -1791,7 +1804,7 @@ def test_to_html(self):
         biggie.to_html(columns=['B', 'A'], col_space=12,
                        float_format=str)
 
-        frame = DataFrame(index=np.arange(1000))
+        frame = DataFrame(index=np.arange(200))
         frame.to_html()
 
     def test_insert(self):
diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py
@@ -607,9 +607,17 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False,
     verify_integrity : boolean, default False
         Check whether the new concatenated axis contains duplicates. This can
         be very expensive relative to the actual data concatenation
-    keys : sequence-like or list of sequences
-    levels :
-    names :
+    keys : sequence, default None
+        If multiple levels are passed, keys should contain tuples
+    levels : list of sequences, default None
+        Specific levels (unique values) to use for constructing a
+        MultiIndex. Otherwise they will be inferred from the keys
+    names : list, default None
+        Names for the levels in the resulting hierarchical index
+
+    Notes
+    -----
+    The keys, levels, and names arguments are all optional
 
     Returns
     -------
@@ -885,22 +893,24 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None):
         else:
             label_list.append(concat_index.values)
 
-        names.extend(_get_consensus_names(indexes))
+        # also copies
+        names = names + _get_consensus_names(indexes)
 
         return MultiIndex.from_arrays(label_list, names=names)
 
     new_index = indexes[0]
     n = len(new_index)
 
-    names.append(indexes[0].name)
+    # also copies
+    names = names + [indexes[0].name]
 
     if levels is None:
         if single_level:
             new_levels = [_ensure_index(keys)]
         else:
-            new_levels = [_ensure_index(k) for k in keys]
+            new_levels = [Factor(zp).level for zp in zipped]
     else:
-        new_levels = list(levels)
+        new_levels = [_ensure_index(x) for x in levels]
 
     # do something a bit more speedy
     new_levels.append(new_index)
diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py
@@ -778,7 +778,24 @@ def test_concat_with_group_keys(self):
         tm.assert_frame_equal(result, expected)
 
     def test_concat_keys_and_levels(self):
-        pass
+        df = DataFrame(np.random.randn(1, 3))
+        df2 = DataFrame(np.random.randn(1, 4))
+
+        levels = [['foo', 'baz'], ['one', 'two']]
+        names = ['first', 'second']
+        result = concat([df, df2, df, df2],
+                        keys=[('foo', 'one'), ('foo', 'two'),
+                              ('baz', 'one'), ('baz', 'two')],
+                        levels=levels,
+                        names=names)
+        expected = concat([df, df2, df, df2])
+        exp_index = MultiIndex(levels=levels + [[0]],
+                               labels=[[0, 0, 1, 1], [0, 1, 0, 1],
+                                       [0, 0, 0, 0]],
+                               names=names + [None])
+        expected.index = exp_index
+
+        assert_frame_equal(result, expected)
 
     def test_crossed_dtypes_weird_corner(self):
         columns = ['A', 'B', 'C', 'D']