From b08c1e000818e302e77c0b3b1cbf152f4d590cfc Mon Sep 17 00:00:00 2001 From: Nick Eubank Date: Thu, 12 Mar 2015 12:22:22 -0700 Subject: [PATCH] Add sample function with tests and docs --- doc/source/api.rst | 3 + doc/source/indexing.rst | 75 ++++++++++++++ doc/source/whatsnew/v0.16.1.txt | 44 +++++++- pandas/core/common.py | 28 ++++++ pandas/core/generic.py | 114 +++++++++++++++++++++ pandas/tests/test_common.py | 20 ++++ pandas/tests/test_generic.py | 172 ++++++++++++++++++++++++++++++++ 7 files changed, 455 insertions(+), 1 deletion(-) diff --git a/doc/source/api.rst b/doc/source/api.rst index 87e9b20f97e69..d442d8631247c 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -390,6 +390,7 @@ Reindexing / Selection / Label manipulation Series.reindex_like Series.rename Series.reset_index + Series.sample Series.select Series.take Series.tail @@ -824,6 +825,7 @@ Reindexing / Selection / Label manipulation DataFrame.reindex_like DataFrame.rename DataFrame.reset_index + DataFrame.sample DataFrame.select DataFrame.set_index DataFrame.tail @@ -1072,6 +1074,7 @@ Reindexing / Selection / Label manipulation Panel.reindex_axis Panel.reindex_like Panel.rename + Panel.sample Panel.select Panel.take Panel.truncate diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst index 1729a9d76cacd..bafc1386fd223 100644 --- a/doc/source/indexing.rst +++ b/doc/source/indexing.rst @@ -508,6 +508,81 @@ A list of indexers where any element is out of bounds will raise an .. _indexing.basics.partial_setting: +Selecting Random Samples +------------------------ +.. versionadded:: 0.16.1 + +A random selection of rows or columns from a Series, DataFrame, or Panel can be obtained with the :meth:`~DataFrame.sample` method. The method will sample rows by default, and accepts a specific number of rows/columns to return, or a fraction of rows. + +.. ipython :: python + + s = Series([0,1,2,3,4,5]) + + # When no arguments are passed, returns 1 row. 
+ s.sample() + + # One may specify either a number of rows: + s.sample(n=3) + + # Or a fraction of the rows: + s.sample(frac=0.5) + +By default, ``sample`` will return each row at most once, but one can also sample with replacement +using the ``replace`` option: + +.. ipython :: python + + s = Series([0,1,2,3,4,5]) + + # Without replacement (default): + s.sample(n=6, replace=False) + + # With replacement: + s.sample(n=6, replace=True) + + +By default, each row has an equal probability of being selected, but if you want rows +to have different probabilities, you can pass the ``sample`` function sampling weights as +``weights``. These weights can be a list, a numpy array, or a Series, but they must be of the same length as the object you are sampling. Missing values will be treated as a weight of zero, and inf values are not allowed. If weights do not sum to 1, they will be re-normalized by dividing all weights by the sum of the weights. For example: + +.. ipython :: python + + s = Series([0,1,2,3,4,5]) + example_weights = [0, 0, 0.2, 0.2, 0.2, 0.4] + s.sample(n=3, weights=example_weights) + + # Weights will be re-normalized automatically + example_weights2 = [0.5, 0, 0, 0, 0, 0] + s.sample(n=1, weights=example_weights2) + +When applied to a DataFrame, you can use a column of the DataFrame as sampling weights +(provided you are sampling rows and not columns) by simply passing the name of the column +as a string. + +.. ipython :: python + + df2 = DataFrame({'col1':[9,8,7,6], 'weight_column':[0.5, 0.4, 0.1, 0]}) + df2.sample(n = 3, weights = 'weight_column') + +``sample`` also allows users to sample columns instead of rows using the ``axis`` argument. + +.. ipython :: python + + df3 = DataFrame({'col1':[1,2,3], 'col2':[2,3,4]}) + df3.sample(n=1, axis=1) + +Finally, one can also set a seed for ``sample``'s random number generator using the ``random_state`` argument, which will accept either an integer (as a seed) or a numpy RandomState object. + +.. 
ipython :: python + + df4 = DataFrame({'col1':[1,2,3], 'col2':[2,3,4]}) + + # With a given seed, the sample will always draw the same rows. + df4.sample(n=2, random_state=2) + df4.sample(n=2, random_state=2) + + + Setting With Enlargement ------------------------ diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt index 0e85b1d7870b2..7ec5c716023fc 100755 --- a/doc/source/whatsnew/v0.16.1.txt +++ b/doc/source/whatsnew/v0.16.1.txt @@ -12,11 +12,12 @@ Highlights include: - Support for a ``CategoricalIndex``, a category based index, see :ref:`here ` - New section on how-to-contribute to *pandas*, see :ref`here ` +- New method ``sample`` for drawing random samples from Series, DataFrames and Panels. See :ref:`here <whatsnew_0161.enhancements.sample>` + .. contents:: What's new in v0.16.1 :local: :backlinks: none - .. _whatsnew_0161.enhancements: Enhancements @@ -137,6 +138,47 @@ values NOT in the categories, similarly to how you can reindex ANY pandas index. See the :ref:`documentation ` for more. (:issue:`7629`) +.. _whatsnew_0161.enhancements.sample: + +Sample +^^^^^^ + +Series, DataFrames, and Panels now have a new method: :meth:`~pandas.DataFrame.sample`. +The method accepts a specific number of rows or columns to return, or a fraction of the +total number of rows or columns. It also has options for sampling with or without replacement, +for passing in a column for weights for non-uniform sampling, and for setting seed values to facilitate replication. + +.. ipython :: python + + example_series = Series([0,1,2,3,4,5]) + + # When no arguments are passed, returns 1 row + example_series.sample() + + # One may specify either a number of rows: + example_series.sample(n=3) + + # Or a fraction of the rows: + example_series.sample(frac=0.5) + + # weights are accepted. + example_weights = [0, 0, 0.2, 0.2, 0.2, 0.4] + example_series.sample(n=3, weights=example_weights) + + # weights will also be normalized if they do not sum to one, + # and missing values will be treated as zeros. 
+ example_weights2 = [0.5, 0, 0, 0, None, np.nan] + example_series.sample(n=1, weights=example_weights2) + + +When applied to a DataFrame, one may pass the name of a column to specify sampling weights +when sampling from rows. + +.. ipython :: python + + df = DataFrame({'col1':[9,8,7,6], 'weight_column':[0.5, 0.4, 0.1, 0]}) + df.sample(n=3, weights='weight_column') + .. _whatsnew_0161.api: API changes diff --git a/pandas/core/common.py b/pandas/core/common.py index 3d23aeff942dc..cb8b19bb79720 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -3319,3 +3319,31 @@ def _maybe_match_name(a, b): if a_name == b_name: return a_name return None + +def _random_state(state=None): + """ + Helper function for processing random_state arguments. + + Parameters + ---------- + state : int, np.random.RandomState, None. + If receives an int, passes to np.random.RandomState() as seed. + If receives an np.random.RandomState object, just returns object. + If receives `None`, returns an np.random.RandomState object. + If receives anything else, raises an informative ValueError. + Default None. + + Returns + ------- + np.random.RandomState + """ + + if is_integer(state): + return np.random.RandomState(state) + elif isinstance(state, np.random.RandomState): + return state + elif state is None: + return np.random.RandomState() + else: + raise ValueError("random_state must be an integer, a numpy RandomState, or None") + diff --git a/pandas/core/generic.py b/pandas/core/generic.py index bb5256f58795a..0d17420d821f7 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1948,6 +1948,120 @@ def tail(self, n=5): return self return self.iloc[-n:] + + def sample(self, n=None, frac=None, replace=False, weights=None, random_state=None, axis=None): + """ + Returns a random sample of items from an axis of object. + + Parameters + ---------- + n : int, optional + Number of items from axis to return. Cannot be used with `frac`. + Default = 1 if `frac` = None. 
+ frac : float, optional + Fraction of axis items to return. Cannot be used with `n`. + replace : boolean, optional + Sample with or without replacement. Default = False. + weights : str or ndarray-like, optional + Default 'None' results in equal probability weighting. + If called on a DataFrame, will accept the name of a column + when axis = 0. + Weights must be same length as axis being sampled. + If weights do not sum to 1, they will be normalized to sum to 1. + Missing values in the weights column will be treated as zero. + inf and -inf values not allowed. + random_state : int or numpy.random.RandomState, optional + Seed for the random number generator (if int), or numpy RandomState + object. + axis : int or string, optional + Axis to sample. Accepts axis number or name. Default is stat axis + for given data type (0 for Series and DataFrames, 1 for Panels). + + Returns + ------- + Same type as caller. + """ + + ### + # Process axis argument + ### + + if axis is None: + axis = self._stat_axis_number + + axis = self._get_axis_number(axis) + + axis_length = self.shape[axis] + + ### + # Process random_state argument + ### + + rs = com._random_state(random_state) + + ### + # Process weights + ### + + # Check weights for compliance + if weights is not None: + + # Strings acceptable if a dataframe and axis = 0 + if isinstance(weights, string_types): + if isinstance(self, pd.DataFrame): + if axis == 0: + try: + weights = self[weights] + except KeyError: + raise KeyError("String passed to weights not a valid column") + else: + raise ValueError("Strings can only be passed to weights when sampling from rows on a DataFrame") + else: + raise ValueError("Strings cannot be passed as weights when sampling from a Series or Panel.") + + #normalize format of weights to Series. 
+ weights = pd.Series(weights, dtype='float64') + + if len(weights) != axis_length: + raise ValueError("Weights and axis to be sampled must be of same length") + + if (weights == np.inf).any() or (weights == -np.inf).any(): + raise ValueError("weight vector may not include `inf` values") + + if (weights < 0).any(): + raise ValueError("weight vector may not include negative values") + + # If has nan, set to zero. + weights = weights.fillna(0) + + # Renormalize to sum to 1 (a no-op when they already do); all-zero weights cannot be normalized + if weights.sum() == 0: + raise ValueError("Invalid weights: weights sum to zero") + weights = weights / weights.sum() + weights = weights.values + + ### + # Process n and frac arguments + ### + + # If no frac or n, default to n=1. + if n is None and frac is None: + n = 1 + elif n is not None and frac is None and n % 1 != 0: + raise ValueError("Only integers accepted as `n` values") + elif n is None and frac is not None: + n = int(round(frac * axis_length)) + elif n is not None and frac is not None: + raise ValueError('Please enter a value for `frac` OR `n`, not both') + + # Check for negative sizes + if n < 0: + raise ValueError("A negative number of rows requested. 
Please provide positive value.") + + locs = rs.choice(axis_length, size=n, replace=replace, p=weights) + return self.take(locs, axis=axis) + + #---------------------------------------------------------------------- # Attribute access diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index d0ae7c9988c8d..f1c988e3f0323 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -524,6 +524,26 @@ def test_is_recompilable(): for f in fails: assert not com.is_re_compilable(f) +def test_random_state(): + import numpy.random as npr + # Check with seed + state = com._random_state(5) + assert_equal(state.uniform(), npr.RandomState(5).uniform()) + + # Check with random state object + state2 = npr.RandomState(10) + assert_equal(com._random_state(state2).uniform(), npr.RandomState(10).uniform()) + + # check with no arg random state + assert isinstance(com._random_state(), npr.RandomState) + + # Error for floats or strings + with tm.assertRaises(ValueError): + com._random_state('test') + + with tm.assertRaises(ValueError): + com._random_state(5.5) + class TestTake(tm.TestCase): # standard incompatible fill error diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py index 3dd8c2594cd46..5a1bc99593fca 100644 --- a/pandas/tests/test_generic.py +++ b/pandas/tests/test_generic.py @@ -354,6 +354,178 @@ def test_head_tail(self): self._compare(o.head(-3), o.head(7)) self._compare(o.tail(-3), o.tail(7)) + def test_sample(self): + # Fixes issue: 2419 + + o = self._construct(shape=10) + + ### + # Check behavior of random_state argument + ### + + # Check for stability when receives seed or random state -- run 10 times. 
+ for test in range(10): + seed = np.random.randint(0,100) + self._compare(o.sample(n=4, random_state=seed), o.sample(n=4, random_state=seed)) + self._compare(o.sample(frac=0.7,random_state=seed), o.sample(frac=0.7, random_state=seed)) + + self._compare(o.sample(n=4, random_state=np.random.RandomState(test)), + o.sample(n=4, random_state=np.random.RandomState(test))) + + self._compare(o.sample(frac=0.7,random_state=np.random.RandomState(test)), + o.sample(frac=0.7, random_state=np.random.RandomState(test))) + + + # Check for error when random_state argument invalid. + with tm.assertRaises(ValueError): + o.sample(random_state='astring!') + + ### + # Check behavior of `frac` and `N` + ### + + # Giving both frac and N throws error + with tm.assertRaises(ValueError): + o.sample(n=3, frac=0.3) + + # Check that raises right error for negative lengths + with tm.assertRaises(ValueError): + o.sample(n=-3) + with tm.assertRaises(ValueError): + o.sample(frac=-0.3) + + # Make sure float values of `n` give error + with tm.assertRaises(ValueError): + o.sample(n= 3.2) + + # Check lengths are right along the default (stat) axis that was sampled + self.assertEqual(o.sample(n=4).shape[o._stat_axis_number], 4) + self.assertEqual(o.sample(frac=0.34).shape[o._stat_axis_number], 3) + self.assertEqual(o.sample(frac=0.36).shape[o._stat_axis_number], 4) + + ### + # Check weights + ### + + # Weight length must be right + with tm.assertRaises(ValueError): + o.sample(n=3, weights=[0,1]) + + with tm.assertRaises(ValueError): + bad_weights = [0.5]*11 + o.sample(n=3, weights=bad_weights) + + # Check won't accept negative weights + with tm.assertRaises(ValueError): + bad_weights = [-0.1]*10 + o.sample(n=3, weights=bad_weights) + + # Check inf and -inf throw errors: + with tm.assertRaises(ValueError): + weights_with_inf = [0.1]*10 + weights_with_inf[0] = np.inf + o.sample(n=3, weights=weights_with_inf) + + with tm.assertRaises(ValueError): + weights_with_ninf = [0.1]*10 + weights_with_ninf[0] = -np.inf + o.sample(n=3, weights=weights_with_ninf) + + + # A few dataframe tests with degenerate weights. 
+ easy_weight_list = [0]*10 + easy_weight_list[5] = 1 + + df = pd.DataFrame({'col1':range(10,20), + 'col2':range(20,30), + 'colString': ['a']*10, + 'easyweights':easy_weight_list}) + sample1 = df.sample(n=1, weights='easyweights') + assert_frame_equal(sample1, df.iloc[5:6]) + + # Ensure proper error if string given as weight for Series, panel, or + # DataFrame with axis = 1. + s = Series(range(10)) + with tm.assertRaises(ValueError): + s.sample(n=3, weights='weight_column') + + panel = pd.Panel(items = [0,1,2], major_axis = [2,3,4], minor_axis = [3,4,5]) + with tm.assertRaises(ValueError): + panel.sample(n=1, weights='weight_column') + + with tm.assertRaises(ValueError): + df.sample(n=1, weights='weight_column', axis = 1) + + # Check weighting key error + with tm.assertRaises(KeyError): + df.sample(n=3, weights='not_a_real_column_name') + + # Check np.nan are replaced by zeros. + weights_with_nan = [np.nan]*10 + weights_with_nan[5] = 0.5 + self._compare(o.sample(n=1, axis=0, weights=weights_with_nan), o.iloc[5:6]) + + # Check None are also replaced by zeros. + weights_with_None = [None]*10 + weights_with_None[5] = 0.5 + self._compare(o.sample(n=1, axis=0, weights=weights_with_None), o.iloc[5:6]) + + # Check that re-normalizes weights that don't sum to one. 
+ weights_less_than_1 = [0]*10 + weights_less_than_1[0] = 0.5 + tm.assert_frame_equal(df.sample(n=1, weights=weights_less_than_1), df.iloc[:1]) + + + ### + # Test axis argument + ### + + # Test axis argument + df = pd.DataFrame({'col1':range(10), 'col2':['a']*10}) + second_column_weight = [0,1] + assert_frame_equal(df.sample(n=1, axis=1, weights=second_column_weight), df[['col2']]) + + # Different axis arg types + assert_frame_equal(df.sample(n=1, axis='columns', weights=second_column_weight), + df[['col2']]) + + weight = [0]*10 + weight[5] = 0.5 + assert_frame_equal(df.sample(n=1, axis='rows', weights=weight), + df.iloc[5:6]) + assert_frame_equal(df.sample(n=1, axis='index', weights=weight), + df.iloc[5:6]) + + + # Check out of range axis values + with tm.assertRaises(ValueError): + df.sample(n=1, axis=2) + + with tm.assertRaises(ValueError): + df.sample(n=1, axis='not_a_name') + + with tm.assertRaises(ValueError): + s = pd.Series(range(10)) + s.sample(n=1, axis=1) + + # Test weight length compared to correct axis + with tm.assertRaises(ValueError): + df.sample(n=1, axis=1, weights=[0.5]*10) + + # Check weights with axis = 1 + easy_weight_list = [0]*3 + easy_weight_list[2] = 1 + + df = pd.DataFrame({'col1':range(10,20), + 'col2':range(20,30), + 'colString': ['a']*10}) + sample1 = df.sample(n=1, axis=1, weights=easy_weight_list) + assert_frame_equal(sample1, df[['colString']]) + + # Test default axes + p = pd.Panel(items = ['a','b','c'], major_axis=[2,4,6], minor_axis=[1,3,5]) + assert_panel_equal(p.sample(n=3, random_state=42), p.sample(n=3, axis=1, random_state=42)) + assert_frame_equal(df.sample(n=3, random_state=42), df.sample(n=3, axis=0, random_state=42)) def test_size_compat(self): # GH8846