ENH: Enhancement for spss kwargs (#56434)

astronights · web-flow · commit c375533d670a · 2024-08-27T09:49:42.000-07:00
* Bug fix for spss kwargs

* Update to whatsnew with bug fix details

* Black formatting fix

* Fix for whatsnew rst pre-commit

* Sort whatsnew alphabetically

* Fixing alphabetical sorting to whatsnew

* Sorting whatsnew

* Fixing kwargs

* Fixing tests

* Test fixes

* Minor change in kwargs

* Resolving PR comments

* Resolving mypy issue

* Docstring update for kwargs mypy

* Updates based on PR comments

* Fixing kwargs types

* Finalizing updates based on suggestions

* Updating to try to fix other random test fails

* Unrelated test fix

* Test fixes for dims/sizes

* Updates to whatsnew

* Case fix

* Update to reword to enhancements
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -32,6 +32,7 @@ Other enhancements
 - :class:`pandas.api.typing.SASReader` is available for typing the output of :func:`read_sas` (:issue:`55689`)
 - :func:`DataFrame.to_excel` now raises an ``UserWarning`` when the character count in a cell exceeds Excel's limitation of 32767 characters (:issue:`56954`)
 - :func:`pandas.merge` now validates the ``how`` parameter input (merge type) (:issue:`59435`)
+- :func:`read_spss` now supports kwargs to be passed to pyreadstat (:issue:`56356`)
 - :func:`read_stata` now returns ``datetime64`` resolutions better matching those natively stored in the stata format (:issue:`55642`)
 - :meth:`DataFrame.agg` called with ``axis=1`` and a ``func`` which relabels the result index now raises a ``NotImplementedError`` (:issue:`58807`).
 - :meth:`Index.get_loc` now accepts also subclasses of ``tuple`` as keys (:issue:`57922`)
diff --git a/pandas/io/spss.py b/pandas/io/spss.py
@@ -1,6 +1,9 @@
 from __future__ import annotations
 
-from typing import TYPE_CHECKING
+from typing import (
+    TYPE_CHECKING,
+    Any,
+)
 
 from pandas._libs import lib
 from pandas.compat._optional import import_optional_dependency
@@ -24,6 +27,7 @@ def read_spss(
     usecols: Sequence[str] | None = None,
     convert_categoricals: bool = True,
     dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
+    **kwargs: Any,
 ) -> DataFrame:
     """
     Load an SPSS file from the file path, returning a DataFrame.
@@ -47,6 +51,10 @@ def read_spss(
           nullable :class:`ArrowDtype` :class:`DataFrame`
 
         .. versionadded:: 2.0
+    **kwargs
+        Additional keyword arguments that can be passed to :func:`pyreadstat.read_sav`.
+
+        .. versionadded:: 3.0
 
     Returns
     -------
@@ -74,7 +82,10 @@ def read_spss(
         usecols = list(usecols)  # pyreadstat requires a list
 
     df, metadata = pyreadstat.read_sav(
-        stringify_path(path), usecols=usecols, apply_value_formats=convert_categoricals
+        stringify_path(path),
+        usecols=usecols,
+        apply_value_formats=convert_categoricals,
+        **kwargs,
     )
     df.attrs = metadata.__dict__
     if dtype_backend is not lib.no_default:
diff --git a/pandas/tests/io/test_spss.py b/pandas/tests/io/test_spss.py
@@ -65,6 +65,22 @@ def test_spss_labelled_str(datapath):
     tm.assert_frame_equal(df, expected)
 
 
+@pytest.mark.filterwarnings("ignore::pandas.errors.ChainedAssignmentError")
+@pytest.mark.filterwarnings("ignore:ChainedAssignmentError:FutureWarning")
+def test_spss_kwargs(datapath):
+    # test file from the Haven project (https://haven.tidyverse.org/)
+    # Licence at LICENSES/HAVEN_LICENSE, LICENSES/HAVEN_MIT
+    fname = datapath("io", "data", "spss", "labelled-str.sav")
+
+    df = pd.read_spss(fname, convert_categoricals=True, row_limit=1)
+    expected = pd.DataFrame({"gender": ["Male"]}, dtype="category")
+    tm.assert_frame_equal(df, expected)
+
+    df = pd.read_spss(fname, convert_categoricals=False, row_offset=1)
+    expected = pd.DataFrame({"gender": ["F"]})
+    tm.assert_frame_equal(df, expected)
+
+
 @pytest.mark.filterwarnings("ignore::pandas.errors.ChainedAssignmentError")
 @pytest.mark.filterwarnings("ignore:ChainedAssignmentError:FutureWarning")
 def test_spss_umlauts(datapath):