From 4dbb97758e6833a872982bb6cb95960aa08f5f52 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 9 Dec 2022 17:15:09 +0100 Subject: [PATCH 1/7] DEV: remove downstream test packages from environment.yml --- environment.yml | 18 ++++++------------ requirements-dev.txt | 14 ++++---------- scripts/generate_pip_deps_from_conda.py | 2 +- 3 files changed, 11 insertions(+), 23 deletions(-) diff --git a/environment.yml b/environment.yml index 70884f4ca98a3..640e86045abdf 100644 --- a/environment.yml +++ b/environment.yml @@ -3,7 +3,7 @@ name: pandas-dev channels: - conda-forge dependencies: - - python=3.8 + - python=3.10 - pip # build dependencies @@ -17,6 +17,7 @@ dependencies: - psutil - pytest-asyncio>=0.17 - boto3 + - coverage # required dependencies - python-dateutil @@ -27,12 +28,14 @@ dependencies: - beautifulsoup4 - blosc - brotlipy + - botocore - bottleneck - fastparquet - fsspec - html5lib - hypothesis - gcsfs + - ipython - jinja2 - lxml - matplotlib @@ -41,6 +44,7 @@ dependencies: - openpyxl - odfpy - pandas-gbq + - py - psycopg2 - pyarrow<10 - pymysql @@ -60,17 +64,7 @@ dependencies: # downstream packages - aiobotocore<2.0.0 # GH#44311 pinned to fix docbuild - - botocore - - cftime - - dask - - ipython - - seaborn - - scikit-learn - - statsmodels - - coverage - - pandas-datareader - - pyyaml - - py + - dask-core # local testing dependencies - moto diff --git a/requirements-dev.txt b/requirements-dev.txt index caa3dd49add3b..8f4039ba9f665 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -10,18 +10,21 @@ pytest-xdist>=1.31 psutil pytest-asyncio>=0.17 boto3 +coverage python-dateutil numpy pytz beautifulsoup4 blosc brotlipy +botocore bottleneck fastparquet fsspec html5lib hypothesis gcsfs +ipython jinja2 lxml matplotlib @@ -30,6 +33,7 @@ numexpr>=2.8.0 openpyxl odfpy pandas-gbq +py psycopg2-binary pyarrow<10 pymysql @@ -47,17 +51,7 @@ xlrd xlsxwriter zstandard aiobotocore<2.0.0 -botocore -cftime dask -ipython -seaborn -scikit-learn -statsmodels -coverage -pandas-datareader -pyyaml -py moto flask asv diff --git a/scripts/generate_pip_deps_from_conda.py b/scripts/generate_pip_deps_from_conda.py index f25ac9a24b98b..8c2b0111949d2 100755 --- a/scripts/generate_pip_deps_from_conda.py +++ b/scripts/generate_pip_deps_from_conda.py @@ -24,8 +24,8 @@ REMAP_VERSION = {"tzdata": "2022.1"} RENAME = { "pytables": "tables", - "geopandas-base": "geopandas", "psycopg2": "psycopg2-binary", + "dask-core": "dask", } From 630d17ea7066b171f709969bde4823dcce852ddd Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 9 Dec 2022 20:22:01 +0100 Subject: [PATCH 2/7] undo python change, add seaborn-base --- environment.yml | 3 ++- requirements-dev.txt | 1 + scripts/generate_pip_deps_from_conda.py | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index 640e86045abdf..c3a07080db55a 100644 --- a/environment.yml +++ b/environment.yml @@ -3,7 +3,7 @@ name: pandas-dev channels: - conda-forge dependencies: - - python=3.10 + - python=3.8 - pip # build dependencies @@ -65,6 +65,7 @@ dependencies: # downstream packages - aiobotocore<2.0.0 # GH#44311 pinned to fix docbuild - dask-core + - seaborn-base # local testing dependencies - moto diff --git a/requirements-dev.txt b/requirements-dev.txt index 8f4039ba9f665..a6baa4cd9f004 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -52,6 +52,7 @@ xlsxwriter zstandard aiobotocore<2.0.0 dask +sseaborn moto flask asv diff --git a/scripts/generate_pip_deps_from_conda.py b/scripts/generate_pip_deps_from_conda.py index 8c2b0111949d2..fba53a7c171dd 100755 --- a/scripts/generate_pip_deps_from_conda.py +++ b/scripts/generate_pip_deps_from_conda.py @@ -26,6 +26,7 @@ "pytables": "tables", "psycopg2": "psycopg2-binary", "dask-core": "dask", + "seaborn-base": "sseaborn", } From fcac8febfc02bead2e89a17a0ac5c58c5c7de95f Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 9 Dec 2022 21:19:41 +0100 Subject: [PATCH 3/7] typo --- requirements-dev.txt | 2 +- scripts/generate_pip_deps_from_conda.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index a6baa4cd9f004..ff83e2b985874 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -52,7 +52,7 @@ xlsxwriter zstandard aiobotocore<2.0.0 dask -sseaborn +seaborn moto flask asv diff --git a/scripts/generate_pip_deps_from_conda.py b/scripts/generate_pip_deps_from_conda.py index fba53a7c171dd..8190104428724 100755 --- a/scripts/generate_pip_deps_from_conda.py +++ b/scripts/generate_pip_deps_from_conda.py @@ -26,7 +26,7 @@ "pytables": "tables", "psycopg2": "psycopg2-binary", "dask-core": "dask", - "seaborn-base": "sseaborn", + "seaborn-base": "seaborn", } From b1c70c719f4d5c3c0f3f2fbec95d3ecff4b38721 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Sat, 10 Dec 2022 20:02:50 +0000 Subject: [PATCH 4/7] use plain code block for statsmodels whatsnew note --- doc/source/whatsnew/v0.16.2.rst | 54 ++++++++++++++++++++++++--------- 1 file changed, 39 insertions(+), 15 deletions(-) diff --git a/doc/source/whatsnew/v0.16.2.rst b/doc/source/whatsnew/v0.16.2.rst index c6c134a383e11..ba20c3ba9ac43 100644 --- a/doc/source/whatsnew/v0.16.2.rst +++ b/doc/source/whatsnew/v0.16.2.rst @@ -61,21 +61,45 @@ In the example above, the functions ``f``, ``g``, and ``h`` each expected the Da When the function you wish to apply takes its data anywhere other than the first argument, pass a tuple of ``(function, keyword)`` indicating where the DataFrame should flow. For example: -.. ipython:: python - :okwarning: - - import statsmodels.formula.api as sm - - bb = pd.read_csv("data/baseball.csv", index_col="id") - - # sm.ols takes (formula, data) - ( - bb.query("h > 0") - .assign(ln_h=lambda df: np.log(df.h)) - .pipe((sm.ols, "data"), "hr ~ ln_h + year + g + C(lg)") - .fit() - .summary() - ) +.. code-block:: ipython + + In [1]: import statsmodels.formula.api as sm + + In [2]: bb = pd.read_csv('data/baseball.csv', index_col='id') + + # sm.poisson takes (formula, data) + In [3]: (bb.query('h > 0') + ...: .assign(ln_h = lambda df: np.log(df.h)) + ...: .pipe((sm.poisson, 'data'), 'hr ~ ln_h + year + g + C(lg)') + ...: .fit() + ...: .summary() + ...: ) + ...: + Optimization terminated successfully. + Current function value: 2.116284 + Iterations 24 + Out[3]: + + """ + Poisson Regression Results + ============================================================================== + Dep. Variable: hr No. Observations: 68 + Model: Poisson Df Residuals: 63 + Method: MLE Df Model: 4 + Date: Sat, 13 Jun 2015 Pseudo R-squ.: 0.6878 + Time: 15:07:13 Log-Likelihood: -143.91 + converged: True LL-Null: -460.91 + LLR p-value: 6.774e-136 + =============================================================================== + coef std err z P>|z| [95.0% Conf. Int.] + ------------------------------------------------------------------------------- + Intercept -1267.3636 457.867 -2.768 0.006 -2164.767 -369.960 + C(lg)[T.NL] -0.2057 0.101 -2.044 0.041 -0.403 -0.008 + ln_h 0.9280 0.191 4.866 0.000 0.554 1.302 + year 0.6301 0.228 2.762 0.006 0.183 1.077 + g 0.0099 0.004 2.754 0.006 0.003 0.017 + =============================================================================== + """ The pipe method is inspired by unix pipes, which stream text through processes. More recently dplyr_ and magrittr_ have introduced the From a16e793bc9a789b9737f67687c991705d61d3fdd Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Sun, 11 Dec 2022 11:22:23 +0000 Subject: [PATCH 5/7] use code-block in user guide --- doc/source/user_guide/basics.rst | 60 +++++++++++++++++++++++++------- 1 file changed, 47 insertions(+), 13 deletions(-) diff --git a/doc/source/user_guide/basics.rst b/doc/source/user_guide/basics.rst index 2204c8b04e438..9755e008a2b20 100644 --- a/doc/source/user_guide/basics.rst +++ b/doc/source/user_guide/basics.rst @@ -827,20 +827,54 @@ In this case, provide ``pipe`` with a tuple of ``(callable, data_keyword)``. For example, we can fit a regression using statsmodels. Their API expects a formula first and a ``DataFrame`` as the second argument, ``data``. We pass in the function, keyword pair ``(sm.ols, 'data')`` to ``pipe``: -.. ipython:: python - :okwarning: - - import statsmodels.formula.api as sm - - bb = pd.read_csv("data/baseball.csv", index_col="id") +.. code-block:: ipython - ( - bb.query("h > 0") - .assign(ln_h=lambda df: np.log(df.h)) - .pipe((sm.ols, "data"), "hr ~ ln_h + year + g + C(lg)") - .fit() - .summary() - ) + In [147]: import statsmodels.formula.api as sm + + In [148]: bb = pd.read_csv("data/baseball.csv", index_col="id") + + In [149]: ( + .....: bb.query("h > 0") + .....: .assign(ln_h=lambda df: np.log(df.h)) + .....: .pipe((sm.ols, "data"), "hr ~ ln_h + year + g + C(lg)") + .....: .fit() + .....: .summary() + .....: ) + .....: + Out[149]: + + """ + OLS Regression Results + ============================================================================== + Dep. Variable: hr R-squared: 0.685 + Model: OLS Adj. R-squared: 0.665 + Method: Least Squares F-statistic: 34.28 + Date: Tue, 22 Nov 2022 Prob (F-statistic): 3.48e-15 + Time: 05:34:17 Log-Likelihood: -205.92 + No. Observations: 68 AIC: 421.8 + Df Residuals: 63 BIC: 432.9 + Df Model: 4 + Covariance Type: nonrobust + =============================================================================== + coef std err t P>|t| [0.025 0.975] + ------------------------------------------------------------------------------- + Intercept -8484.7720 4664.146 -1.819 0.074 -1.78e+04 835.780 + C(lg)[T.NL] -2.2736 1.325 -1.716 0.091 -4.922 0.375 + ln_h -1.3542 0.875 -1.547 0.127 -3.103 0.395 + year 4.2277 2.324 1.819 0.074 -0.417 8.872 + g 0.1841 0.029 6.258 0.000 0.125 0.243 + ============================================================================== + Omnibus: 10.875 Durbin-Watson: 1.999 + Prob(Omnibus): 0.004 Jarque-Bera (JB): 17.298 + Skew: 0.537 Prob(JB): 0.000175 + Kurtosis: 5.225 Cond. No. 1.49e+07 + ============================================================================== + + Notes: + [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. + [2] The condition number is large, 1.49e+07. This might indicate that there are + strong multicollinearity or other numerical problems. + """ The pipe method is inspired by unix pipes and more recently dplyr_ and magrittr_, which have introduced the popular ``(%>%)`` (read pipe) operator for R_. From 4194f5bfbbd53a4bb067c4a4d7a90f908b7dc435 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Sun, 11 Dec 2022 13:06:53 +0000 Subject: [PATCH 6/7] fixup 0.16.2 whatsnew --- doc/source/whatsnew/v0.16.2.rst | 62 +++++++++++++++++++-------------- 1 file changed, 36 insertions(+), 26 deletions(-) diff --git a/doc/source/whatsnew/v0.16.2.rst b/doc/source/whatsnew/v0.16.2.rst index ba20c3ba9ac43..ef73c4b092fc1 100644 --- a/doc/source/whatsnew/v0.16.2.rst +++ b/doc/source/whatsnew/v0.16.2.rst @@ -65,40 +65,50 @@ of ``(function, keyword)`` indicating where the DataFrame should flow. For examp In [1]: import statsmodels.formula.api as sm - In [2]: bb = pd.read_csv('data/baseball.csv', index_col='id') - - # sm.poisson takes (formula, data) - In [3]: (bb.query('h > 0') - ...: .assign(ln_h = lambda df: np.log(df.h)) - ...: .pipe((sm.poisson, 'data'), 'hr ~ ln_h + year + g + C(lg)') - ...: .fit() - ...: .summary() + In [2]: bb = pd.read_csv("data/baseball.csv", index_col="id") + + # sm.ols takes (formula, data) + In [3]: ( + ...: bb.query("h > 0") + ...: .assign(ln_h=lambda df: np.log(df.h)) + ...: .pipe((sm.ols, "data"), "hr ~ ln_h + year + g + C(lg)") + ...: .fit() + ...: .summary() ...: ) ...: - Optimization terminated successfully. - Current function value: 2.116284 - Iterations 24 Out[3]: """ - Poisson Regression Results + OLS Regression Results ============================================================================== - Dep. Variable: hr No. Observations: 68 - Model: Poisson Df Residuals: 63 - Method: MLE Df Model: 4 - Date: Sat, 13 Jun 2015 Pseudo R-squ.: 0.6878 - Time: 15:07:13 Log-Likelihood: -143.91 - converged: True LL-Null: -460.91 - LLR p-value: 6.774e-136 + Dep. Variable: hr R-squared: 0.685 + Model: OLS Adj. R-squared: 0.665 + Method: Least Squares F-statistic: 34.28 + Date: Tue, 22 Nov 2022 Prob (F-statistic): 3.48e-15 + Time: 05:35:23 Log-Likelihood: -205.92 + No. Observations: 68 AIC: 421.8 + Df Residuals: 63 BIC: 432.9 + Df Model: 4 + Covariance Type: nonrobust =============================================================================== - coef std err z P>|z| [95.0% Conf. Int.] + coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------- - Intercept -1267.3636 457.867 -2.768 0.006 -2164.767 -369.960 - C(lg)[T.NL] -0.2057 0.101 -2.044 0.041 -0.403 -0.008 - ln_h 0.9280 0.191 4.866 0.000 0.554 1.302 - year 0.6301 0.228 2.762 0.006 0.183 1.077 - g 0.0099 0.004 2.754 0.006 0.003 0.017 - =============================================================================== + Intercept -8484.7720 4664.146 -1.819 0.074 -1.78e+04 835.780 + C(lg)[T.NL] -2.2736 1.325 -1.716 0.091 -4.922 0.375 + ln_h -1.3542 0.875 -1.547 0.127 -3.103 0.395 + year 4.2277 2.324 1.819 0.074 -0.417 8.872 + g 0.1841 0.029 6.258 0.000 0.125 0.243 + ============================================================================== + Omnibus: 10.875 Durbin-Watson: 1.999 + Prob(Omnibus): 0.004 Jarque-Bera (JB): 17.298 + Skew: 0.537 Prob(JB): 0.000175 + Kurtosis: 5.225 Cond. No. 1.49e+07 + ============================================================================== + + Notes: + [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. + [2] The condition number is large, 1.49e+07. This might indicate that there are + strong multicollinearity or other numerical problems. """ The pipe method is inspired by unix pipes, which stream text through From 8a71912730dff05b2d684c530908209db39d8dac Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 11 Dec 2022 20:59:55 +0100 Subject: [PATCH 7/7] also remove pandas-gbq --- ci/deps/actions-38-downstream_compat.yaml | 2 +- environment.yml | 1 - requirements-dev.txt | 1 - 3 files changed, 1 insertion(+), 3 deletions(-) diff --git a/ci/deps/actions-38-downstream_compat.yaml b/ci/deps/actions-38-downstream_compat.yaml index 15ce02204ee99..fcd453b9d9fee 100644 --- a/ci/deps/actions-38-downstream_compat.yaml +++ b/ci/deps/actions-38-downstream_compat.yaml @@ -39,7 +39,6 @@ dependencies: - numexpr - openpyxl - odfpy - - pandas-gbq - psycopg2 - pyarrow<10 - pymysql @@ -68,5 +67,6 @@ dependencies: - statsmodels - coverage - pandas-datareader + - pandas-gbq - pyyaml - py diff --git a/environment.yml b/environment.yml index c3a07080db55a..5a26f8fd1520c 100644 --- a/environment.yml +++ b/environment.yml @@ -43,7 +43,6 @@ dependencies: - numexpr>=2.8.0 # pin for "Run checks on imported code" job - openpyxl - odfpy - - pandas-gbq - py - psycopg2 - pyarrow<10 diff --git a/requirements-dev.txt b/requirements-dev.txt index ff83e2b985874..f6378ddd2e18d 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -32,7 +32,6 @@ numba>=0.53.1 numexpr>=2.8.0 openpyxl odfpy -pandas-gbq py psycopg2-binary pyarrow<10