From b08c1e000818e302e77c0b3b1cbf152f4d590cfc Mon Sep 17 00:00:00 2001
From: Nick Eubank <nickeubank@users.noreply.github.com>
Date: Thu, 12 Mar 2015 12:22:22 -0700
Subject: [PATCH] Add sample function with tests and docs

---
 doc/source/api.rst              |   3 +
 doc/source/indexing.rst         |  75 ++++++++++++++
 doc/source/whatsnew/v0.16.1.txt |  44 +++++++-
 pandas/core/common.py           |  28 ++++++
 pandas/core/generic.py          | 114 +++++++++++++++++++++
 pandas/tests/test_common.py     |  20 ++++
 pandas/tests/test_generic.py    | 172 ++++++++++++++++++++++++++++++++
 7 files changed, 455 insertions(+), 1 deletion(-)

diff --git a/doc/source/api.rst b/doc/source/api.rst
index 87e9b20f97e69..d442d8631247c 100644
--- a/doc/source/api.rst
+++ b/doc/source/api.rst
@@ -390,6 +390,7 @@ Reindexing / Selection / Label manipulation
    Series.reindex_like
    Series.rename
    Series.reset_index
+   Series.sample
    Series.select
    Series.take
    Series.tail
@@ -824,6 +825,7 @@ Reindexing / Selection / Label manipulation
    DataFrame.reindex_like
    DataFrame.rename
    DataFrame.reset_index
+   DataFrame.sample
    DataFrame.select
    DataFrame.set_index
    DataFrame.tail
@@ -1072,6 +1074,7 @@ Reindexing / Selection / Label manipulation
    Panel.reindex_axis
    Panel.reindex_like
    Panel.rename
+   Panel.sample
    Panel.select
    Panel.take
    Panel.truncate
diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst
index 1729a9d76cacd..bafc1386fd223 100644
--- a/doc/source/indexing.rst
+++ b/doc/source/indexing.rst
@@ -508,6 +508,81 @@ A list of indexers where any element is out of bounds will raise an
 
 .. _indexing.basics.partial_setting:
 
+Selecting Random Samples
+------------------------
+.. versionadded:: 0.16.1
+
+A random selection of rows or columns from a Series, DataFrame, or Panel can be obtained with the :meth:`~DataFrame.sample` method. The method will sample rows by default, and accepts a specific number of rows/columns to return, or a fraction of rows.
+
+.. ipython :: python
+
+    s = Series([0,1,2,3,4,5])
+
+    # When no arguments are passed, returns 1 row.
+    s.sample()
+    
+    # One may specify either a number of rows:
+    s.sample(n=3)
+   
+    # Or a fraction of the rows:
+    s.sample(frac=0.5)
+
+By default, ``sample`` will return each row at most once, but one can also sample with replacement
+using the ``replace`` option:
+
+.. ipython :: python
+
+    s = Series([0,1,2,3,4,5])
+
+    # Without replacement (default):
+    s.sample(n=6, replace=False)
+
+    # With replacement:
+    s.sample(n=6, replace=True)
+
+
+By default, each row has an equal probability of being selected, but if you want rows
+to have different probabilities, you can pass the ``sample`` function sampling weights as 
+``weights``. These weights can be a list, a numpy array, or a Series, but they must be of the same length as the object you are sampling. Missing values will be treated as a weight of zero, and inf values are not allowed. If weights do not sum to 1, they will be re-normalized by dividing all weights by the sum of the weights. For example:
+
+.. ipython :: python
+
+    s = Series([0,1,2,3,4,5])
+    example_weights = [0, 0, 0.2, 0.2, 0.2, 0.4]
+    s.sample(n=3, weights=example_weights)
+    
+    # Weights will be re-normalized automatically
+    example_weights2 = [0.5, 0, 0, 0, 0, 0]
+    s.sample(n=1, weights=example_weights2)
+
+When applied to a DataFrame, you can use a column of the DataFrame as sampling weights
+(provided you are sampling rows and not columns) by simply passing the name of the column 
+as a string.
+    
+.. ipython :: python
+
+    df2 = DataFrame({'col1':[9,8,7,6], 'weight_column':[0.5, 0.4, 0.1, 0]})
+    df2.sample(n = 3, weights = 'weight_column')
+
+``sample`` also allows users to sample columns instead of rows using the ``axis`` argument. 
+
+.. ipython :: python
+
+    df3 = DataFrame({'col1':[1,2,3], 'col2':[2,3,4]})
+    df3.sample(n=1, axis=1)
+
+Finally, one can also set a seed for ``sample``'s random number generator using the ``random_state`` argument, which will accept either an integer (as a seed) or a numpy RandomState object. 
+
+.. ipython :: python
+
+    df4 = DataFrame({'col1':[1,2,3], 'col2':[2,3,4]})
+
+    # With a given seed, the sample will always draw the same rows. 
+    df4.sample(n=2, random_state=2)
+    df4.sample(n=2, random_state=2)
+
+
+
 Setting With Enlargement
 ------------------------
 
diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt
index 0e85b1d7870b2..7ec5c716023fc 100755
--- a/doc/source/whatsnew/v0.16.1.txt
+++ b/doc/source/whatsnew/v0.16.1.txt
@@ -12,11 +12,12 @@ Highlights include:
 - Support for a ``CategoricalIndex``, a category based index, see :ref:`here <whatsnew_0161.enhancements.categoricalindex>`
 - New section on how-to-contribute to *pandas*, see :ref`here <contributing>`
 
+- New method ``sample`` for drawing random samples from Series, DataFrames and Panels. See :ref:`here <whatsnew_0161.enhancements.sample>`
+
 .. contents:: What's new in v0.16.1
     :local:
     :backlinks: none
 
-
 .. _whatsnew_0161.enhancements:
 
 Enhancements
@@ -137,6 +138,47 @@ values NOT in the categories, similarly to how you can reindex ANY pandas index.
 
 See the :ref:`documentation <advanced.categoricalindex>` for more. (:issue:`7629`)
 
+.. _whatsnew_0161.enhancements.sample:
+
+Sample
+^^^^^^
+
+Series, DataFrames, and Panels now have a new method: :meth:`~pandas.DataFrame.sample`.
+The method accepts a specific number of rows or columns to return, or a fraction of the 
+total number of rows or columns. It also has options for sampling with or without replacement,
+for passing in a column for weights for non-uniform sampling, and for setting seed values to facilitate replication. 
+
+.. ipython :: python
+
+   example_series = Series([0,1,2,3,4,5])
+
+   # When no arguments are passed, returns 1 row
+   example_series.sample()
+   
+   # One may specify either a number of rows:
+   example_series.sample(n=3)
+   
+   # Or a fraction of the rows:
+   example_series.sample(frac=0.5)
+
+   # weights are accepted. 
+   example_weights = [0, 0, 0.2, 0.2, 0.2, 0.4]
+   example_series.sample(n=3, weights=example_weights)
+
+   # weights will also be normalized if they do not sum to one, 
+   # and missing values will be treated as zeros. 
+   example_weights2 = [0.5, 0, 0, 0, None, np.nan]
+   example_series.sample(n=1, weights=example_weights2)
+
+
+When applied to a DataFrame, one may pass the name of a column to specify sampling weights
+when sampling from rows. 
+	
+.. ipython :: python
+
+   df = DataFrame({'col1':[9,8,7,6], 'weight_column':[0.5, 0.4, 0.1, 0]})
+   df.sample(n=3, weights='weight_column')
+
 .. _whatsnew_0161.api:
 
 API changes
diff --git a/pandas/core/common.py b/pandas/core/common.py
index 3d23aeff942dc..cb8b19bb79720 100644
--- a/pandas/core/common.py
+++ b/pandas/core/common.py
@@ -3319,3 +3319,31 @@ def _maybe_match_name(a, b):
     if a_name == b_name:
         return a_name
     return None
+
+def _random_state(state=None):
+    """
+    Normalize a ``random_state`` argument into a numpy RandomState.
+
+    Parameters
+    ----------
+    state : int, np.random.RandomState or None, default None
+        An int seeds a new np.random.RandomState; an existing instance is
+        returned unchanged; None gives a RandomState built with no arguments.
+
+    Returns
+    -------
+    np.random.RandomState
+
+    Raises
+    ------
+    ValueError
+        If `state` is anything other than the types listed above.
+    """
+    if state is None:
+        return np.random.RandomState()
+    if isinstance(state, np.random.RandomState):
+        return state
+    if is_integer(state):
+        return np.random.RandomState(state)
+    raise ValueError("random_state must be an integer, a numpy RandomState, or None")
+
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index bb5256f58795a..0d17420d821f7 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -1948,6 +1948,120 @@ def tail(self, n=5):
             return self
         return self.iloc[-n:]
 
+  
+    def sample(self, n=None, frac=None, replace=False, weights=None, random_state=None, axis=None):
+        """
+        Returns a random sample of items from an axis of object.
+
+        Parameters
+        ----------
+        n : int, optional
+            Number of items from axis to return. Cannot be used with `frac`.
+            Default = 1 if `frac` = None.
+        frac : float, optional
+            Fraction of axis items to return. Cannot be used with `n`.
+        replace : boolean, optional
+            Sample with or without replacement. Default = False.
+        weights : str or ndarray-like, optional
+            Default 'None' results in equal probability weighting.
+            If called on a DataFrame, will accept the name of a column
+            when axis = 0.
+            Weights must be same length as axis being sampled.
+            If weights do not sum to 1, they will be normalized to sum to 1.
+            Missing values in the weights column will be treated as zero.
+            inf and -inf values not allowed.
+        random_state : int or numpy.random.RandomState, optional
+            Seed for the random number generator (if int), or numpy RandomState
+            object.
+        axis : int or string, optional
+            Axis to sample. Accepts axis number or name. Default is stat axis
+            for given data type (0 for Series and DataFrames, 1 for Panels).
+
+        Returns
+        -------
+        Same type as caller.
+
+        Raises
+        ------
+        ValueError
+            If both `n` and `frac` are given, if `n` is not a whole number,
+            if the requested size is negative, if `weights` has the wrong
+            length, contains inf or negative values, or sums to zero, or if
+            a string is passed as `weights` for anything but DataFrame rows.
+        KeyError
+            If `weights` is a string that is not a column of the DataFrame.
+        """
+
+        # Resolve the axis to sample; default to the stat axis of the object.
+        if axis is None:
+            axis = self._stat_axis_number
+
+        axis = self._get_axis_number(axis)
+        axis_length = self.shape[axis]
+
+        # Turn the random_state argument into a numpy RandomState.
+        rs = com._random_state(random_state)
+
+        # Check weights for compliance
+        if weights is not None:
+
+            # Strings acceptable if a dataframe and axis = 0
+            if isinstance(weights, string_types):
+                if not isinstance(self, pd.DataFrame):
+                    raise ValueError("Strings cannot be passed as weights when sampling from a Series or Panel.")
+                if axis != 0:
+                    raise ValueError("Strings can only be passed to weights when sampling from rows on a DataFrame")
+                try:
+                    weights = self[weights]
+                except KeyError:
+                    raise KeyError("String passed to weights not a valid column")
+
+            # normalize format of weights to Series.
+            weights = pd.Series(weights, dtype='float64')
+
+            if len(weights) != axis_length:
+                raise ValueError("Weights and axis to be sampled must be of same length")
+
+            if (weights == np.inf).any() or (weights == -np.inf).any():
+                raise ValueError("weight vector may not include `inf` values")
+
+            if (weights < 0).any():
+                raise ValueError("weight vector may not include negative values")
+
+            # If has nan, set to zero.
+            weights = weights.fillna(0)
+
+            # All-zero weights cannot be normalized: dividing by a zero sum
+            # would turn every weight into NaN, so reject them explicitly.
+            total_weight = weights.sum()
+            if total_weight == 0:
+                raise ValueError("Invalid weights: weights sum to zero")
+
+            # Renormalize if don't sum to 1
+            if total_weight != 1:
+                weights = weights / total_weight
+
+            weights = weights.values
+
+        # If no frac or n, default to n=1.
+        if n is None and frac is None:
+            n = 1
+        elif n is not None and frac is None and n % 1 != 0:
+            raise ValueError("Only integers accepted as `n` values")
+        elif n is None and frac is not None:
+            n = int(round(frac * axis_length))
+        elif n is not None and frac is not None:
+            raise ValueError('Please enter a value for `frac` OR `n`, not both')
+
+        # Check for negative sizes
+        if n < 0:
+            raise ValueError("A negative number of rows requested. Please provide positive value.")
+
+        # Draw integer positions along the axis, then select them.
+        locs = rs.choice(axis_length, size=n, replace=replace, p=weights)
+        return self.take(locs, axis=axis)
+
+    
     #----------------------------------------------------------------------
     # Attribute access
 
diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py
index d0ae7c9988c8d..f1c988e3f0323 100644
--- a/pandas/tests/test_common.py
+++ b/pandas/tests/test_common.py
@@ -524,6 +524,26 @@ def test_is_recompilable():
     for f in fails:
         assert not com.is_re_compilable(f)
 
+def test_random_state():
+    # Exercise com._random_state with each accepted and rejected input kind.
+    import numpy.random as npr
+    # An integer is used as a seed: draws match a RandomState seeded alike.
+    seeded = com._random_state(5)
+    assert_equal(seeded.uniform(), npr.RandomState(5).uniform())
+
+    # A RandomState instance comes back and draws like its own seed.
+    existing = npr.RandomState(10)
+    returned = com._random_state(existing)
+    assert_equal(returned.uniform(), npr.RandomState(10).uniform())
+
+    # With no argument a RandomState is still produced.
+    assert isinstance(com._random_state(), npr.RandomState)
+
+    # Strings and floats are rejected with a ValueError.
+    for invalid in ('test', 5.5):
+        with tm.assertRaises(ValueError):
+            com._random_state(invalid)
+
 
 class TestTake(tm.TestCase):
     # standard incompatible fill error
diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py
index 3dd8c2594cd46..5a1bc99593fca 100644
--- a/pandas/tests/test_generic.py
+++ b/pandas/tests/test_generic.py
@@ -354,6 +354,178 @@ def test_head_tail(self):
             self._compare(o.head(-3), o.head(7))
             self._compare(o.tail(-3), o.tail(7))
 
+    def test_sample(self):
+        # Fixes issue: 2419
+
+        o = self._construct(shape=10)
+
+        ###
+        # Check behavior of random_state argument
+        ###
+
+        # Check for stability when receives seed or random state -- run 10 times.
+        for test in range(10):
+            seed = np.random.randint(0,100)
+            self._compare(o.sample(n=4, random_state=seed), o.sample(n=4, random_state=seed))
+            self._compare(o.sample(frac=0.7,random_state=seed), o.sample(frac=0.7, random_state=seed))
+
+            self._compare(o.sample(n=4, random_state=np.random.RandomState(test)),
+                          o.sample(n=4, random_state=np.random.RandomState(test)))
+
+            self._compare(o.sample(frac=0.7,random_state=np.random.RandomState(test)),
+                          o.sample(frac=0.7, random_state=np.random.RandomState(test)))
+
+
+        # Check for error when random_state argument invalid.
+        with tm.assertRaises(ValueError):
+            o.sample(random_state='astring!')
+
+        ###
+        # Check behavior of `frac` and `N`
+        ###
+
+        # Giving both frac and N throws error
+        with tm.assertRaises(ValueError):
+            o.sample(n=3, frac=0.3)
+
+        # Check that raises right error for negative lengths
+        with tm.assertRaises(ValueError):
+            o.sample(n=-3)
+        with tm.assertRaises(ValueError):
+            o.sample(frac=-0.3)
+
+        # Make sure float values of `n` give error
+        with tm.assertRaises(ValueError):
+            o.sample(n= 3.2)
+
+        # Check the size of the sampled axis (the default, stat axis) is right
+        self.assertEqual(o.sample(n=4).shape[o._stat_axis_number], 4)
+        self.assertEqual(o.sample(frac=0.34).shape[o._stat_axis_number], 3)
+        self.assertEqual(o.sample(frac=0.36).shape[o._stat_axis_number], 4)
+
+        ###
+        # Check weights
+        ###
+
+        # Weight length must be right
+        with tm.assertRaises(ValueError):
+            o.sample(n=3, weights=[0,1])
+
+        with tm.assertRaises(ValueError):
+            bad_weights = [0.5]*11
+            o.sample(n=3, weights=bad_weights)
+
+        # Check won't accept negative weights
+        with tm.assertRaises(ValueError):
+            bad_weights = [-0.1]*10
+            o.sample(n=3, weights=bad_weights)
+
+        # Check inf and -inf throw errors:
+        with tm.assertRaises(ValueError):
+            weights_with_inf = [0.1]*10
+            weights_with_inf[0] = np.inf
+            o.sample(n=3, weights=weights_with_inf)
+
+        with tm.assertRaises(ValueError):
+            weights_with_ninf = [0.1]*10
+            weights_with_ninf[0] =  -np.inf
+            o.sample(n=3, weights=weights_with_ninf)
+
+
+        # A few dataframe test with degenerate weights.
+        easy_weight_list = [0]*10
+        easy_weight_list[5] = 1
+
+        df = pd.DataFrame({'col1':range(10,20),
+                           'col2':range(20,30),
+                           'colString': ['a']*10,
+                           'easyweights':easy_weight_list})
+        sample1 = df.sample(n=1, weights='easyweights')
+        assert_frame_equal(sample1, df.iloc[5:6])
+
+        # Ensure proper error if string given as weight for Series, panel, or
+        # DataFrame with axis = 1.
+        s = Series(range(10))
+        with tm.assertRaises(ValueError):
+            s.sample(n=3, weights='weight_column')
+
+        panel = pd.Panel(items = [0,1,2], major_axis = [2,3,4], minor_axis = [3,4,5])
+        with tm.assertRaises(ValueError):
+            panel.sample(n=1, weights='weight_column')
+
+        with tm.assertRaises(ValueError):
+            df.sample(n=1, weights='weight_column', axis = 1)
+
+        # Check weighting key error
+        with tm.assertRaises(KeyError):
+            df.sample(n=3, weights='not_a_real_column_name')
+
+        # Check np.nan are replaced by zeros.
+        weights_with_nan = [np.nan]*10
+        weights_with_nan[5] = 0.5
+        self._compare(o.sample(n=1, axis=0, weights=weights_with_nan), o.iloc[5:6])
+
+        # Check None are also replaced by zeros.
+        weights_with_None = [None]*10
+        weights_with_None[5] = 0.5
+        self._compare(o.sample(n=1, axis=0, weights=weights_with_None), o.iloc[5:6])
+
+        # Check that re-normalizes weights that don't sum to one.
+        weights_less_than_1 = [0]*10
+        weights_less_than_1[0] = 0.5
+        tm.assert_frame_equal(df.sample(n=1, weights=weights_less_than_1), df.iloc[:1])
+
+
+        ###
+        # Test axis argument
+        ###
+
+        # Test axis argument
+        df = pd.DataFrame({'col1':range(10), 'col2':['a']*10})
+        second_column_weight = [0,1]
+        assert_frame_equal(df.sample(n=1, axis=1, weights=second_column_weight), df[['col2']])
+
+        # Different axis arg types
+        assert_frame_equal(df.sample(n=1, axis='columns', weights=second_column_weight),
+                           df[['col2']])
+
+        weight = [0]*10
+        weight[5] = 0.5
+        assert_frame_equal(df.sample(n=1, axis='rows', weights=weight),
+                           df.iloc[5:6])
+        assert_frame_equal(df.sample(n=1, axis='index', weights=weight),
+                           df.iloc[5:6])
+
+
+        # Check out of range axis values
+        with tm.assertRaises(ValueError):
+            df.sample(n=1, axis=2)
+
+        with tm.assertRaises(ValueError):
+            df.sample(n=1, axis='not_a_name')
+
+        with tm.assertRaises(ValueError):
+            s = pd.Series(range(10))
+            s.sample(n=1, axis=1)
+
+        # Test weight length compared to correct axis
+        with tm.assertRaises(ValueError):
+            df.sample(n=1, axis=1, weights=[0.5]*10)
+
+        # Check weights with axis = 1
+        easy_weight_list = [0]*3
+        easy_weight_list[2] = 1
+
+        df = pd.DataFrame({'col1':range(10,20),
+                           'col2':range(20,30),
+                           'colString': ['a']*10})
+        sample1 = df.sample(n=1, axis=1, weights=easy_weight_list)
+        assert_frame_equal(sample1, df[['colString']])
+
+        # Test default axes
+        p = pd.Panel(items = ['a','b','c'], major_axis=[2,4,6], minor_axis=[1,3,5])
+        assert_panel_equal(p.sample(n=3, random_state=42), p.sample(n=3, axis=1, random_state=42))
+        assert_frame_equal(df.sample(n=3, random_state=42), df.sample(n=3, axis=0, random_state=42))
 
     def test_size_compat(self):
         # GH8846