From c82c7f7949cc44139e9733e0242f758a46248569 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Wed, 19 Feb 2020 11:37:38 -0500 Subject: [PATCH 01/20] DOC Adds pandas SLEP --- slep013/proposal.rst | 173 +++++++++++++++++++++++++++++++++++++++++++ under_review.rst | 1 + 2 files changed, 174 insertions(+) create mode 100644 slep013/proposal.rst diff --git a/slep013/proposal.rst b/slep013/proposal.rst new file mode 100644 index 0000000..934afd9 --- /dev/null +++ b/slep013/proposal.rst @@ -0,0 +1,173 @@ +.. _slep_013: + +============================== +SLEP013: Pandas In, Pandas Out +============================== + +:Author: Thomas J Fan +:Status: Under Review +:Type: Standards Track +:Created: 2020-02-18 + +Abstract +######## + +This SLEP proposes using pandas DataFrames for propagating feature names +through ``scikit-learn`` estimators. + +Motivation +########## + +``scikit-learn`` is generally used as a part of a larger data processing +pipeline. When this pipeline is used to transform data, the result is a +NumPy array, which discards column names. The current workflow for +extracting the feature names requires calling ``get_feature_names`` on the +transformer that created the feature. 
This interface can be cumbersome when used +together with a pipeline with multiple column names:: + + import pandas as pd + import numpy as np + from sklearn.compose import make_column_transformer + from sklearn.preprocessing import OneHotEncoder, StandardScaler + from sklearn.pipeline import make_pipeline + from sklearn.linear_model import LogisticRegression + + X = pd.DataFrame({'letter': ['a', 'b', 'c'], + 'pet': ['dog', 'snake', 'dog'], + 'num': [1, 2, 3]}) + y = [0, 0, 1] + orig_cat_cols, orig_num_cols = ['letter', 'pet'], ['num'] + + ct = make_column_transformer( + (OneHotEncoder(), orig_cat_cols), (StandardScaler(), orig_num_cols)) + pipe = make_pipeline(ct, LogisticRegression()).fit(X,y) + + cat_names = (pipe['columntransformer'] + .named_transformers_['onehotencoder'] + .get_feature_names(orig_cat_cols)) + + feature_names = np.r_[cat_names, orig_num_cols] + +The ``feature_names`` extracted above corresponds to the features directly +passed into ``LogisticRegression``. As demonstrated above, the process of +extracting ``feature_names`` requires knowing the order of the selected +categories in the ``ColumnTransformer``. Furthemore, if there is feature +selection in the pipeline, such as ``SelectKBest``, the ``get_support`` method +would need to be used to select column names that were selected through. + +Solution +######## + +The pandas ``DataFrame`` has been widely adopted by the Python Data ecosystem to +store data with feature names. This SLEP proposes using a ``DataFrame`` to +track the feature names as the data is transformed. With this feature, the +API for extracting feature names would be:: + + from sklearn import set_config + set_config(pandas_inout=True) + + pipe.fit(X, y) + X_trans = pipe[:-1].transform(X) + + print(X_trans.columns.tolist() + ['letter_a', 'letter_b', 'letter_c', 'pet_dog', 'pet_snake', 'num'] + +Enabling Functionality +###################### + +The following enhancements are **not** a part of this SLEP. 
These features are +made possible if this SLEP gets accepted. + +1. Allows estimators to treat columns differently based on name or dtype. For + example, the categorical dtype is useful for tree building algorithms. + +2. Storing feature names inside estimators for model inspection:: + + from sklearn import set_config + set_config(store_feature_names_in=True) + + pipe.fit(X, y) + + pipe['logisticregression'].feature_names_in_ + +3. Allow for extracting the feature names of estimators in meta-estimators:: + + from sklearn import set_config + set_config(store_feature_names_in=True) + + est = BaggingClassifier(LogisticRegression()) + est.fit(X, y) + + # Gets the feature names used by an estimator in the ensemble + est.estimators_[0].feature_names_in_ + +Considerations +############## + +Index alignment +--------------- + +Operations are index aligned when working with ``DataFrames``. Interally, +``scikit-learn`` will ignore the alignment by operating on the ndarray as +suggested by `TomAugspurger `_:: + + def transform(self, X, y=None): + X, row_labels, input_type = check_array(X) + # X is a ndarray + result = ... + # some hypothetical function that recreates a DataFrame / DataArray, + # preserving row labels, attaching new features names. + return construct_result(result, output_feature_names, row_labels, input_type) + +Memory copies +------------- + +As noted in `pandas #27211 `_, +there is not a guarantee that there is a zero-copy round-trip going from numpy +to a ``DataFrame``. In other words, the following may lead to a memory copy in +a future version of ``pandas``:: + + X = np.array(...) + X_df = pd.DataFrame(X) + X_again = np.asarray(X_df) + +This is an issue for ``scikit-learn`` when estimators are placed into a +pipeline. 
For example, consider the following pipeline:: + + set_config(pandas_inout=True) + pipe = make_pipeline(StandardScaler(), LogisticRegression()) + pipe.fit(X, y) + +Interally, ``StandardScaler.fit_transform`` will operate on a ndarray and +wrap the ndarray into a ``DataFrame`` as a return value. This is will be +piped into ``LogisticRegression.fit`` which calls ``check_array`` on the +``DataFrame``, which may lead to a memory copy in a future version of +``pandas``. This leads to unnecessary overhead from piping the data from one +estimator to another. + +Backward compatibility +###################### + +The ``set_config(pandas_inout=True)`` global configuration flag will be set to +``False`` by default to ensure backward compatibility. When this flag is False, +the output of all estimators will be a ndarray. + +Alternatives +############ + +- :ref:`SLEP012 Custom InputArray Data Structure ` + +References and Footnotes +------------------------ + +.. [1] Each SLEP must either be explicitly labeled as placed in the public + domain (see this SLEP as an example) or licensed under the `Open + Publication License`_. + +.. _Open Publication License: https://www.opencontent.org/openpub/ + + +Copyright +--------- + +This document has been placed in the public domain. [1]_ diff --git a/under_review.rst b/under_review.rst index ff52d4e..44cfc56 100644 --- a/under_review.rst +++ b/under_review.rst @@ -10,3 +10,4 @@ SLEPs under review slep007/proposal slep012/proposal + slep013/proposal From 40e4831101877f7ae0b8cb5042a26791b1f4b06d Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Wed, 19 Feb 2020 11:44:13 -0500 Subject: [PATCH 02/20] FIX Grammer --- slep013/proposal.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slep013/proposal.rst b/slep013/proposal.rst index 934afd9..7243616 100644 --- a/slep013/proposal.rst +++ b/slep013/proposal.rst @@ -53,7 +53,7 @@ passed into ``LogisticRegression``. 
As demonstrated above, the process of extracting ``feature_names`` requires knowing the order of the selected categories in the ``ColumnTransformer``. Furthemore, if there is feature selection in the pipeline, such as ``SelectKBest``, the ``get_support`` method -would need to be used to select column names that were selected through. +would need to be used to select column names that were selected. Solution ######## From d07803986b71f30ba0a44304b028b8ed9328f8aa Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Wed, 19 Feb 2020 11:47:46 -0500 Subject: [PATCH 03/20] DOC Move slep --- {slep013 => slep014}/proposal.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) rename {slep013 => slep014}/proposal.rst (99%) diff --git a/slep013/proposal.rst b/slep014/proposal.rst similarity index 99% rename from slep013/proposal.rst rename to slep014/proposal.rst index 7243616..528df80 100644 --- a/slep013/proposal.rst +++ b/slep014/proposal.rst @@ -1,7 +1,7 @@ -.. _slep_013: +.. _slep_014: ============================== -SLEP013: Pandas In, Pandas Out +SLEP014: Pandas In, Pandas Out ============================== :Author: Thomas J Fan From 3147fabd66ee1a47b356350445f7528df47ac1ba Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Wed, 19 Feb 2020 11:48:42 -0500 Subject: [PATCH 04/20] DOC Adds to under review --- under_review.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/under_review.rst b/under_review.rst index 44cfc56..b9585f3 100644 --- a/under_review.rst +++ b/under_review.rst @@ -11,3 +11,4 @@ SLEPs under review slep007/proposal slep012/proposal slep013/proposal + slep014/proposal From 7647a57d076a91387f3e453619e277af79ba985c Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Wed, 19 Feb 2020 12:18:56 -0500 Subject: [PATCH 05/20] CLN Address comments --- slep014/proposal.rst | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/slep014/proposal.rst b/slep014/proposal.rst index 528df80..6cd6275 100644 --- a/slep014/proposal.rst +++ 
b/slep014/proposal.rst @@ -5,7 +5,7 @@ SLEP014: Pandas In, Pandas Out ============================== :Author: Thomas J Fan -:Status: Under Review +:Status: Draft :Type: Standards Track :Created: 2020-02-18 @@ -72,6 +72,10 @@ API for extracting feature names would be:: print(X_trans.columns.tolist() ['letter_a', 'letter_b', 'letter_c', 'pet_dog', 'pet_snake', 'num'] +This introduces a soft dependency on ``pandas``, which is opt-in with the +the configuration flag: ``pandas_inout``. By default, `pandas_inout` is set +to ``False``, resulting in the output of all estimators to be a ndarray. + Enabling Functionality ###################### @@ -101,6 +105,9 @@ made possible if this SLEP gets accepted. # Gets the feature names used by an estimator in the ensemble est.estimators_[0].feature_names_in_ +For options 2 and 3 the default value of configuration flag: +`store_feature_names_in` is False. + Considerations ############## From fd25211d73ed95d62fe710a2b574350209669fd5 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Wed, 19 Feb 2020 14:13:31 -0500 Subject: [PATCH 06/20] CLN --- slep014/proposal.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/slep014/proposal.rst b/slep014/proposal.rst index 6cd6275..0cfcd21 100644 --- a/slep014/proposal.rst +++ b/slep014/proposal.rst @@ -18,9 +18,9 @@ through ``scikit-learn`` estimators. Motivation ########## -``scikit-learn`` is generally used as a part of a larger data processing +``scikit-learn`` can be used as a part of a larger data processing pipeline. When this pipeline is used to transform data, the result is a -NumPy array, which discards column names. The current workflow for +NumPy array, discarding column names. The current workflow for extracting the feature names requires calling ``get_feature_names`` on the transformer that created the feature. 
This interface can be cumbersome when used together with a pipeline with multiple column names:: From 570f3e205dc9d7ba8fdb3dbacbbf7d411c289f9d Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Wed, 19 Feb 2020 14:45:16 -0500 Subject: [PATCH 07/20] CLN Adds more details --- slep014/proposal.rst | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/slep014/proposal.rst b/slep014/proposal.rst index 0cfcd21..10cf008 100644 --- a/slep014/proposal.rst +++ b/slep014/proposal.rst @@ -58,8 +58,8 @@ would need to be used to select column names that were selected. Solution ######## -The pandas ``DataFrame`` has been widely adopted by the Python Data ecosystem to -store data with feature names. This SLEP proposes using a ``DataFrame`` to +The pandas DataFrame has been widely adopted by the Python Data ecosystem to +store data with feature names. This SLEP proposes using a DataFrame to track the feature names as the data is transformed. With this feature, the API for extracting feature names would be:: @@ -72,8 +72,8 @@ API for extracting feature names would be:: print(X_trans.columns.tolist() ['letter_a', 'letter_b', 'letter_c', 'pet_dog', 'pet_snake', 'num'] -This introduces a soft dependency on ``pandas``, which is opt-in with the -the configuration flag: ``pandas_inout``. By default, `pandas_inout` is set +This introduces a soft dependency on pandas, which is opt-in with the +the configuration flag: ``pandas_inout``. By default, ``pandas_inout`` is set to ``False``, resulting in the output of all estimators to be a ndarray. Enabling Functionality @@ -114,7 +114,7 @@ Considerations Index alignment --------------- -Operations are index aligned when working with ``DataFrames``. Interally, +Operations are index aligned when working with DataFrames. 
Interally, ``scikit-learn`` will ignore the alignment by operating on the ndarray as suggested by `TomAugspurger `_:: @@ -131,7 +131,7 @@ Memory copies As noted in `pandas #27211 `_, there is not a guarantee that there is a zero-copy round-trip going from numpy -to a ``DataFrame``. In other words, the following may lead to a memory copy in +to a DataFrame. In other words, the following may lead to a memory copy in a future version of ``pandas``:: X = np.array(...) @@ -146,9 +146,9 @@ pipeline. For example, consider the following pipeline:: pipe.fit(X, y) Interally, ``StandardScaler.fit_transform`` will operate on a ndarray and -wrap the ndarray into a ``DataFrame`` as a return value. This is will be +wrap the ndarray into a DataFrame as a return value. This is will be piped into ``LogisticRegression.fit`` which calls ``check_array`` on the -``DataFrame``, which may lead to a memory copy in a future version of +DataFrame, which may lead to a memory copy in a future version of ``pandas``. This leads to unnecessary overhead from piping the data from one estimator to another. @@ -162,7 +162,9 @@ the output of all estimators will be a ndarray. Alternatives ############ -- :ref:`SLEP012 Custom InputArray Data Structure ` +- :ref:`SLEP012 Custom InputArray Data Structure `: This approach + adds another data structure in the Python Data ecosystem. This increases + the maintenance responsibilities of the ``scikit-learn`` library. 
References and Footnotes ------------------------ From 2d86336b4662cb19c51665bd74b7989e6d141327 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Wed, 19 Feb 2020 15:24:49 -0500 Subject: [PATCH 08/20] CLN Removes print --- slep014/proposal.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slep014/proposal.rst b/slep014/proposal.rst index 10cf008..870cc25 100644 --- a/slep014/proposal.rst +++ b/slep014/proposal.rst @@ -69,7 +69,7 @@ API for extracting feature names would be:: pipe.fit(X, y) X_trans = pipe[:-1].transform(X) - print(X_trans.columns.tolist() + X_trans.columns.tolist() ['letter_a', 'letter_b', 'letter_c', 'pet_dog', 'pet_snake', 'num'] This introduces a soft dependency on pandas, which is opt-in with the From 914f76ed895de31d95015fbb12dbc2d21000e2ba Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Wed, 19 Feb 2020 15:37:12 -0500 Subject: [PATCH 09/20] CLN Adds more detals about alternatives --- slep014/proposal.rst | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/slep014/proposal.rst b/slep014/proposal.rst index 870cc25..7718837 100644 --- a/slep014/proposal.rst +++ b/slep014/proposal.rst @@ -166,6 +166,15 @@ Alternatives adds another data structure in the Python Data ecosystem. This increases the maintenance responsibilities of the ``scikit-learn`` library. +- Use xarray's Dataset, ``xr.Dataset``: The pandas DataFrame is more widely used + in Python's Data ecosystem, which means more libraries are built with pandas + in mind. With xarray support, users will need to convert their DataFrame into + a ``xr.Dataset``. This converstion process will be lossy when working with + pandas categorical dtypes. + +In both alternatives, the output data structure will need to be converted into +a pandas DataFrame to take advantage of the ecosytem built around pandas. 
+ References and Footnotes ------------------------ From e326d6743cfa8fa8142f5e0e469b97dc949de15e Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Wed, 19 Feb 2020 15:43:48 -0500 Subject: [PATCH 10/20] CLN Adds details and links around round trips --- slep014/proposal.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/slep014/proposal.rst b/slep014/proposal.rst index 7718837..2ad885f 100644 --- a/slep014/proposal.rst +++ b/slep014/proposal.rst @@ -175,6 +175,12 @@ Alternatives In both alternatives, the output data structure will need to be converted into a pandas DataFrame to take advantage of the ecosytem built around pandas. +The biggest advantage of both alternatives is that they will not have the memory +copy issue. Since ``InputArray`` is designed from the ground up, we can +guarantee that it does not make memory copies during round-trips from numpy. +As stated in `xarray #3077 `, +``xarray`` guarantees that there is no copies during round-trips form numpy. + References and Footnotes ------------------------ From d13fb433fc1780a399128a2f1ed201f8715f96d8 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Wed, 19 Feb 2020 15:46:46 -0500 Subject: [PATCH 11/20] CLN Grammer --- slep014/proposal.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/slep014/proposal.rst b/slep014/proposal.rst index 2ad885f..741f211 100644 --- a/slep014/proposal.rst +++ b/slep014/proposal.rst @@ -175,11 +175,11 @@ Alternatives In both alternatives, the output data structure will need to be converted into a pandas DataFrame to take advantage of the ecosytem built around pandas. -The biggest advantage of both alternatives is that they will not have the memory +The major advantage of both alternatives is that they do not have the memory copy issue. Since ``InputArray`` is designed from the ground up, we can guarantee that it does not make memory copies during round-trips from numpy. 
As stated in `xarray #3077 `, -``xarray`` guarantees that there is no copies during round-trips form numpy. +``xarray`` guarantees that there is no copies during round-trips from numpy. References and Footnotes ------------------------ From b1d5a86300f6dd195ebbbb38381845c73db4fd0b Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Wed, 19 Feb 2020 15:47:57 -0500 Subject: [PATCH 12/20] CLN Grammer --- slep014/proposal.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slep014/proposal.rst b/slep014/proposal.rst index 741f211..385791b 100644 --- a/slep014/proposal.rst +++ b/slep014/proposal.rst @@ -175,7 +175,7 @@ Alternatives In both alternatives, the output data structure will need to be converted into a pandas DataFrame to take advantage of the ecosytem built around pandas. -The major advantage of both alternatives is that they do not have the memory +A major advantage of both alternatives is that they do not have the memory copy issue. Since ``InputArray`` is designed from the ground up, we can guarantee that it does not make memory copies during round-trips from numpy. As stated in `xarray #3077 `, From 8ffde790ad4e22cceba3b39535edef145636e74a Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Wed, 19 Feb 2020 15:52:06 -0500 Subject: [PATCH 13/20] CLN Fix link --- slep014/proposal.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slep014/proposal.rst b/slep014/proposal.rst index 385791b..8430a56 100644 --- a/slep014/proposal.rst +++ b/slep014/proposal.rst @@ -178,7 +178,7 @@ a pandas DataFrame to take advantage of the ecosytem built around pandas. A major advantage of both alternatives is that they do not have the memory copy issue. Since ``InputArray`` is designed from the ground up, we can guarantee that it does not make memory copies during round-trips from numpy. -As stated in `xarray #3077 `, +As stated in `xarray #3077 `_, ``xarray`` guarantees that there is no copies during round-trips from numpy. 
References and Footnotes From ea502da151c7428defafc97681be159f6c26a96f Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Wed, 19 Feb 2020 15:53:39 -0500 Subject: [PATCH 14/20] CLN Grammer --- slep014/proposal.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slep014/proposal.rst b/slep014/proposal.rst index 8430a56..411c76f 100644 --- a/slep014/proposal.rst +++ b/slep014/proposal.rst @@ -18,7 +18,7 @@ through ``scikit-learn`` estimators. Motivation ########## -``scikit-learn`` can be used as a part of a larger data processing +``scikit-learn`` is commonly used as a part of a larger data processing pipeline. When this pipeline is used to transform data, the result is a NumPy array, discarding column names. The current workflow for extracting the feature names requires calling ``get_feature_names`` on the From 665dd5e2f9c4b452d1d9a24b2d65bce2d95394df Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Fri, 21 Feb 2020 15:45:24 -0500 Subject: [PATCH 15/20] CLN Address comments --- slep014/proposal.rst | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/slep014/proposal.rst b/slep014/proposal.rst index 411c76f..5944f71 100644 --- a/slep014/proposal.rst +++ b/slep014/proposal.rst @@ -13,7 +13,7 @@ Abstract ######## This SLEP proposes using pandas DataFrames for propagating feature names -through ``scikit-learn`` estimators. +through ``scikit-learn`` transformers. Motivation ########## @@ -64,7 +64,7 @@ track the feature names as the data is transformed. 
With this feature, the API for extracting feature names would be:: from sklearn import set_config - set_config(pandas_inout=True) + set_config(pandas_in_out=True) pipe.fit(X, y) X_trans = pipe[:-1].transform(X) @@ -72,9 +72,14 @@ API for extracting feature names would be:: X_trans.columns.tolist() ['letter_a', 'letter_b', 'letter_c', 'pet_dog', 'pet_snake', 'num'] -This introduces a soft dependency on pandas, which is opt-in with the -the configuration flag: ``pandas_inout``. By default, ``pandas_inout`` is set -to ``False``, resulting in the output of all estimators to be a ndarray. +This SLEP proposes attaching feature names to the output of ``transform``. In +the above example, ``pipe[:-1].transform(X)`` propagates the feature names +through the multiple transformers. + +This feature is only available through a soft dependency on pandas. Furthermore, +it will be opt-in with the configuration flag: ``pandas_in_out``. By +default, ``pandas_in_out`` is set to ``False``, resulting in the output of all +estimators to be a ndarray. Enabling Functionality ###################### @@ -114,7 +119,7 @@ Considerations Index alignment --------------- -Operations are index aligned when working with DataFrames. Interally, +Operations are index aligned when working with DataFrames. Internally, ``scikit-learn`` will ignore the alignment by operating on the ndarray as suggested by `TomAugspurger `_:: @@ -141,7 +146,7 @@ a future version of ``pandas``:: This is an issue for ``scikit-learn`` when estimators are placed into a pipeline. For example, consider the following pipeline:: - set_config(pandas_inout=True) + set_config(pandas_in_out=True) pipe = make_pipeline(StandardScaler(), LogisticRegression()) pipe.fit(X, y) @@ -155,7 +160,7 @@ estimator to another. 
Backward compatibility ###################### -The ``set_config(pandas_inout=True)`` global configuration flag will be set to +The ``set_config(pandas_in_out=True)`` global configuration flag will be set to ``False`` by default to ensure backward compatibility. When this flag is False, the output of all estimators will be a ndarray. From aed332b71c052af3b90c607466aafa09d870ce29 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Wed, 4 Mar 2020 11:34:06 -0500 Subject: [PATCH 16/20] WIP --- slep014/proposal.rst | 79 +++++++++++++++++++++++++++++++++----------- 1 file changed, 60 insertions(+), 19 deletions(-) diff --git a/slep014/proposal.rst b/slep014/proposal.rst index 5944f71..cce85f4 100644 --- a/slep014/proposal.rst +++ b/slep014/proposal.rst @@ -40,7 +40,7 @@ together with a pipeline with multiple column names:: ct = make_column_transformer( (OneHotEncoder(), orig_cat_cols), (StandardScaler(), orig_num_cols)) - pipe = make_pipeline(ct, LogisticRegression()).fit(X,y) + pipe = make_pipeline(ct, LogisticRegression()).fit(X, y) cat_names = (pipe['columntransformer'] .named_transformers_['onehotencoder'] @@ -111,7 +111,7 @@ made possible if this SLEP gets accepted. est.estimators_[0].feature_names_in_ For options 2 and 3 the default value of configuration flag: -`store_feature_names_in` is False. +``store_feature_names_in`` is False. Considerations ############## @@ -157,6 +157,16 @@ DataFrame, which may lead to a memory copy in a future version of ``pandas``. This leads to unnecessary overhead from piping the data from one estimator to another. +Sparse matrices +--------------- + +Traditionally, ``scikit-learn`` prefers to process sparse matrices in +the compressed sparse row (CSR) matrix format. The `sparse data structure `_ in pandas 1.0, only supports converting directly to +the cooridnate format (COO). Although this format was designed to quickly +convert to CSR or CSC formats, the converation process still needs to allocate +more memory to store. 
This can be an issue with transformers such as the +``OneHotEncoder.transform`` which has been optimized to construct a CSR matrix. + Backward compatibility ###################### @@ -167,27 +177,58 @@ the output of all estimators will be a ndarray. Alternatives ############ -- :ref:`SLEP012 Custom InputArray Data Structure `: This approach - adds another data structure in the Python Data ecosystem. This increases - the maintenance responsibilities of the ``scikit-learn`` library. +This section lists alternative data structures that can be used with their +advantages and disadvantages when compared to a pandas DataFrame. + +InputArray +---------- + +The proposed ``InputArray`` described in +:ref:`SLEP012 Custom InputArray Data Structure ` introduces a new +data structure for homogenous data. + +Pros +~~~~ + +- A thin wrapper around a numpy array or a sparse matrix with a minimal feature + set that ``scikit-learn`` can evolve independently. + +Cons +~~~~ + +- Introduces yet another data structure for data storage in the PyData + ecosystem. +- Does not have a clear path to encode categorical features. + +XArray Dataset +-------------- + +`xarray's Dataset `_ +is a multi-dimenstional version of panda's DataFrame. + +Pros +~~~~ + +- xarray guartantees that there will be no copies during round-trips from + numpy. (`xarray #3077 `_) + +Cons +~~~~ + +- The `conversation from a pandas DataFrame to a Dataset `_ + is not lossless. For example, categorical dtypes in a pandas dataframe will + lose its categorical information when converted to a Dataset. +- xarray does not have as much adoption as pandas, which increases the learning + curve for using Dataset with `scikit-learn``. -- Use xarray's Dataset, ``xr.Dataset``: The pandas DataFrame is more widely used - in Python's Data ecosystem, which means more libraries are built with pandas - in mind. With xarray support, users will need to convert their DataFrame into - a ``xr.Dataset``. 
This converstion process will be lossy when working with - pandas categorical dtypes. +XArray DataArray +---------------- -In both alternatives, the output data structure will need to be converted into -a pandas DataFrame to take advantage of the ecosytem built around pandas. +`xarray's Data` -A major advantage of both alternatives is that they do not have the memory -copy issue. Since ``InputArray`` is designed from the ground up, we can -guarantee that it does not make memory copies during round-trips from numpy. -As stated in `xarray #3077 `_, -``xarray`` guarantees that there is no copies during round-trips from numpy. References and Footnotes ------------------------- +######################## .. [1] Each SLEP must either be explicitly labeled as placed in the public domain (see this SLEP as an example) or licensed under the `Open @@ -197,6 +238,6 @@ References and Footnotes Copyright ---------- +######### This document has been placed in the public domain. [1]_ From a999266b46f2c015542b5c94030edc3118829cf9 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Sat, 7 Mar 2020 12:30:29 -0500 Subject: [PATCH 17/20] DOC Adds more sections and details --- index.rst | 1 + slep014/proposal.rst | 50 +++++++++++++++++++++++++++++++++++++------- 2 files changed, 44 insertions(+), 7 deletions(-) diff --git a/index.rst b/index.rst index a68713e..089e2e4 100644 --- a/index.rst +++ b/index.rst @@ -12,6 +12,7 @@ slep007/proposal slep012/proposal slep013/proposal + slep014/proposal .. toctree:: :maxdepth: 1 diff --git a/slep014/proposal.rst b/slep014/proposal.rst index cce85f4..e28beca 100644 --- a/slep014/proposal.rst +++ b/slep014/proposal.rst @@ -174,6 +174,28 @@ The ``set_config(pandas_in_out=True)`` global configuration flag will be set to ``False`` by default to ensure backward compatibility. When this flag is False, the output of all estimators will be a ndarray. 
+Community Adoption +################## + +With the new ``pandas_in_out`` configuration flag, third party libraries may +need to query the configuration flag to be fully compliant with this SLEP. +Specifically, "to be fully compliant" entails the following policy: + +1. If ``pandas_in_out=False``, then ``transform`` always returns a numpy array. +2. If ``pandas_in_out=True``, then ``transform`` returns a DataFrame if the + input is a DataFrame. + +This policy can either be enforced with ``check_estimator`` or not: + +- **Enforce**: This increases the maintenance burden of third party libraries. + This burden includes: checking for the configuration flag, generating feature names and including pandas as a dependency to their library. + +- **Not Enforce**: Currently, third party transformers can return a DataFrame + or a numpy array and this is mostly compatible with ``scikit-learn``. Users with + third party transformers would not be able to access the features enabled + by this SLEP. + + Alternatives ############ @@ -196,9 +218,9 @@ Pros Cons ~~~~ -- Introduces yet another data structure for data storage in the PyData - ecosystem. -- Does not have a clear path to encode categorical features. +- Introduces another data structure for data storage in the PyData ecosystem. +- Currently, the design only allows for homogenous data. +- Increases maintenance responsibilities for ``scikit-learn``. XArray Dataset -------------- @@ -209,23 +231,37 @@ is a multi-dimenstional version of panda's DataFrame. Pros ~~~~ -- xarray guartantees that there will be no copies during round-trips from - numpy. (`xarray #3077 `_) +- Can be used for heterogeneous data. Cons ~~~~ +- ``scikit-learn`` does not require many of the features Dataset provides. +- Needs to be converted to a DataArray before it can be converted to a numpy array. - The `conversation from a pandas DataFrame to a Dataset `_ is not lossless. 
For example, categorical dtypes in a pandas dataframe will lose its categorical information when converted to a Dataset. - xarray does not have as much adoption as pandas, which increases the learning - curve for using Dataset with `scikit-learn``. + curve for using Dataset with ``scikit-learn``. XArray DataArray ---------------- -`xarray's Data` +`xarray's DataArray `_ +is a data structure that store homogenous data. + +Pros +~~~~ + +- xarray guartantees that there will be no copies during round-trips from + numpy. (`xarray #3077 `_) + +Cons +~~~~ +- Can only be used for homogenous data. +- As with XArray's Dataset, DataArray does not have much adoption as pandas, + which increases the learning curve for using DataArray with ``scikit-learn``. References and Footnotes ######################## From 4588fc41908eaa8e3a3988addb00084e14a5f888 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Sun, 8 Mar 2020 17:33:22 -0400 Subject: [PATCH 18/20] CLN Address comments --- slep014/proposal.rst | 29 +++++++---------------------- 1 file changed, 7 insertions(+), 22 deletions(-) diff --git a/slep014/proposal.rst b/slep014/proposal.rst index e28beca..8f50216 100644 --- a/slep014/proposal.rst +++ b/slep014/proposal.rst @@ -116,21 +116,6 @@ For options 2 and 3 the default value of configuration flag: Considerations ############## -Index alignment ---------------- - -Operations are index aligned when working with DataFrames. Internally, -``scikit-learn`` will ignore the alignment by operating on the ndarray as -suggested by `TomAugspurger `_:: - - def transform(self, X, y=None): - X, row_labels, input_type = check_array(X) - # X is a ndarray - result = ... - # some hypothetical function that recreates a DataFrame / DataArray, - # preserving row labels, attaching new features names. 
- return construct_result(result, output_feature_names, row_labels, input_type) - Memory copies ------------- @@ -161,9 +146,9 @@ Sparse matrices --------------- Traditionally, ``scikit-learn`` prefers to process sparse matrices in -the compressed sparse row (CSR) matrix format. The `sparse data structure `_ in pandas 1.0, only supports converting directly to -the cooridnate format (COO). Although this format was designed to quickly -convert to CSR or CSC formats, the converation process still needs to allocate +the compressed sparse row (CSR) matrix format. The `sparse data structure `_ in pandas 1.0 only supports converting directly to +the coordinate format (COO). Although this format was designed to quickly +convert to CSR or CSC formats, the conversion process still needs to allocate more memory to store. This can be an issue with transformers such as the ``OneHotEncoder.transform`` which has been optimized to construct a CSR matrix. @@ -238,9 +223,9 @@ Cons - ``scikit-learn`` does not require many of the features Dataset provides. - Needs to be converted to a DataArray before it can be converted to a numpy array. -- The `conversation from a pandas DataFrame to a Dataset `_ +- The `conversion from a pandas DataFrame to a Dataset `_ is not lossless. For example, categorical dtypes in a pandas dataframe will - lose its categorical information when converted to a Dataset. + lose their categorical information when converted to a Dataset. - xarray does not have as much adoption as pandas, which increases the learning curve for using Dataset with ``scikit-learn``. @@ -253,14 +238,14 @@ is a data structure that store homogenous data. Pros ~~~~ -- xarray guartantees that there will be no copies during round-trips from +- xarray guarantees that there will be no copies during round-trips from numpy. (`xarray #3077 `_) Cons ~~~~ - Can only be used for homogenous data. 
-- As with XArray's Dataset, DataArray does not have much adoption as pandas, +- As with XArray's Dataset, DataArray does not have as much adoption as pandas, which increases the learning curve for using DataArray with ``scikit-learn``. References and Footnotes From 85c1c7c1f69414e1fb73a091ca2f1fe42c3c9a79 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Mon, 28 Nov 2022 12:54:33 -0500 Subject: [PATCH 19/20] Apply suggestions from code review Co-authored-by: Joel Nothman --- slep014/proposal.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slep014/proposal.rst b/slep014/proposal.rst index 8f50216..59f1e87 100644 --- a/slep014/proposal.rst +++ b/slep014/proposal.rst @@ -53,7 +53,7 @@ passed into ``LogisticRegression``. As demonstrated above, the process of extracting ``feature_names`` requires knowing the order of the selected categories in the ``ColumnTransformer``. Furthemore, if there is feature selection in the pipeline, such as ``SelectKBest``, the ``get_support`` method -would need to be used to select column names that were selected. +would need to be used to determine column names that were selected. Solution ######## From 653e476588537b735f973f80bf211e32f96920da Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Mon, 28 Nov 2022 13:01:00 -0500 Subject: [PATCH 20/20] DOC Move proposal to rejected --- rejected.rst | 4 ---- slep014/proposal.rst | 6 +++--- 2 files changed, 3 insertions(+), 7 deletions(-) delete mode 100644 rejected.rst diff --git a/rejected.rst b/rejected.rst deleted file mode 100644 index 42799a4..0000000 --- a/rejected.rst +++ /dev/null @@ -1,4 +0,0 @@ -Rejected SLEPs -============== - -Nothing here diff --git a/slep014/proposal.rst b/slep014/proposal.rst index 59f1e87..adf8fbc 100644 --- a/slep014/proposal.rst +++ b/slep014/proposal.rst @@ -5,7 +5,7 @@ SLEP014: Pandas In, Pandas Out ============================== :Author: Thomas J Fan -:Status: Draft +:Status: Rejected :Type: Standards Track :Created: 2020-02-18 @@ -32,7 +32,7 @@ together with a pipeline with multiple column names:: from sklearn.pipeline import make_pipeline from sklearn.linear_model import LogisticRegression - X = pd.DataFrame({'letter': ['a', 'b', 'c'], + X = pd.DataFrame({'letter': ['a', 'b', 'c'], 'pet': ['dog', 'snake', 'dog'], 'num': [1, 2, 3]}) y = [0, 0, 1] @@ -223,7 +223,7 @@ Cons - ``scikit-learn`` does not require many of the features Dataset provides. - Needs to be converted to a DataArray before it can be converted to a numpy array. -- The `conversion from a pandas DataFrame to a Dataset `_ +- The `conversion from a pandas DataFrame to a Dataset `_ is not lossless. For example, categorical dtypes in a pandas dataframe will lose their categorical information when converted to a Dataset. - xarray does not have as much adoption as pandas, which increases the learning