From e3ab1100863a8d04d1691db78766982650571322 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 10 Sep 2019 17:01:48 +0200 Subject: [PATCH 01/15] add PeriodType arrow extension type --- pandas/core/arrays/period.py | 57 ++++++++++++++++++++++++++++++ pandas/tests/arrays/test_period.py | 42 ++++++++++++++++++++++ 2 files changed, 99 insertions(+) diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index f2d74794eadf5..50bf795f4109d 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -1,4 +1,6 @@ from datetime import timedelta +from distutils.version import LooseVersion +import json import operator from typing import Any, Callable, List, Optional, Sequence, Union @@ -50,6 +52,13 @@ from pandas.tseries import frequencies from pandas.tseries.offsets import DateOffset, Tick, _delta_to_tick +try: + import pyarrow + + _PYARROW_INSTALLED = True +except ImportError: + _PYARROW_INSTALLED = False + def _field_accessor(name, alias, docstring=None): def f(self): @@ -338,6 +347,19 @@ def __array__(self, dtype=None): # overriding DatetimelikeArray return np.array(list(self), dtype=object) + def __arrow_array__(self, type=None): + """ + Convert myself into a pyarrow Array. 
+ """ + import pyarrow as pa + + if type is not None and not isinstance(type, PeriodType): + raise TypeError("not supported") + + period_type = PeriodType(self.freqstr) + storage_array = pa.array(self._data, mask=self.isna(), type="int64") + return pa.ExtensionArray.from_storage(period_type, storage_array) + # -------------------------------------------------------------------- # Vectorized analogues of Period properties @@ -1089,3 +1111,38 @@ def _make_field_arrays(*fields): ] return arrays + + +if _PYARROW_INSTALLED and ( + LooseVersion(pyarrow.__version__) >= LooseVersion("0.14.1.dev") +): + + class PeriodType(pyarrow.ExtensionType): + def __init__(self, freq): + # attributes need to be set first before calling + # super init (as that calls serialize) + self._freq = freq + pyarrow.ExtensionType.__init__(self, pyarrow.int64(), "pandas.period") + + @property + def freq(self): + return self._freq + + def __arrow_ext_serialize__(self): + metadata = {"freq": self.freq} + return json.dumps(metadata).encode() + + @classmethod + def __arrow_ext_deserialize__(cls, storage_type, serialized): + metadata = json.loads(serialized.decode()) + return PeriodType(metadata["freq"]) + + def __eq__(self, other): + if isinstance(other, pyarrow.BaseExtensionType): + return type(self) == type(other) and self.freq == other.freq + else: + return NotImplemented + + # register the type with a dummy instance + _period_type = PeriodType("D") + pyarrow.register_extension_type(_period_type) diff --git a/pandas/tests/arrays/test_period.py b/pandas/tests/arrays/test_period.py index 252f278242fcc..9eb8456e55647 100644 --- a/pandas/tests/arrays/test_period.py +++ b/pandas/tests/arrays/test_period.py @@ -1,3 +1,5 @@ +from distutils.version import LooseVersion + import numpy as np import pytest @@ -10,6 +12,14 @@ from pandas.core.arrays import PeriodArray, period_array import pandas.util.testing as tm +try: + import pyarrow + + _PYARROW_INSTALLED = True +except ImportError: + _PYARROW_INSTALLED = 
False + + # ---------------------------------------------------------------------------- # Dtype @@ -323,3 +333,35 @@ def test_min_max_empty(self, skipna): result = arr.max(skipna=skipna) assert result is pd.NaT + + +# ---------------------------------------------------------------------------- +# Arrow interaction + + +@pytest.mark.skipif( + not _PYARROW_INSTALLED + or _PYARROW_INSTALLED + and LooseVersion(pyarrow.__version__) < LooseVersion("0.14.1.dev"), + reason="pyarrow >= 0.15.0 required", +) +@pytest.mark.parametrize( + "data, freq", + [ + (pd.date_range("2017", periods=3), "D"), + (pd.date_range("2017", periods=3, freq="A"), "A-DEC"), + ], +) +def test_arrow_array(data, freq): + import pyarrow as pa + from pandas.core.arrays.period import PeriodType + + periods = period_array(data, freq=freq) + arr = pa.array(periods) + assert isinstance(arr.type, PeriodType) + assert arr.type.freq == freq + expected = pa.array(periods.asi8, type="int64") + assert arr.storage.equals(expected) + + with pytest.raises(TypeError): + pa.array(periods, type="float64") From 6c1300f49ec825666985212ac8533a2e054d61c1 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 10 Sep 2019 17:30:24 +0200 Subject: [PATCH 02/15] add IntervalType arrow extension type --- pandas/core/arrays/interval.py | 79 +++++++++++++++++++ pandas/tests/arrays/interval/test_interval.py | 32 ++++++++ 2 files changed, 111 insertions(+) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 1f4b76a259f00..1901db9a9ad7e 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -1,3 +1,5 @@ +from distutils.version import LooseVersion +import json from operator import le, lt import textwrap @@ -39,6 +41,14 @@ import pandas.core.common as com from pandas.core.indexes.base import ensure_index +try: + import pyarrow + + _PYARROW_INSTALLED = True +except ImportError: + _PYARROW_INSTALLED = False + + _VALID_CLOSED = {"left", "right", "both", "neither"} 
_interval_shared_docs = {} @@ -1035,6 +1045,27 @@ def __array__(self, dtype=None): result[i] = Interval(left[i], right[i], closed) return result + def __arrow_array__(self, type=None): + """ + Convert myself into a pyarrow Array. + """ + import pyarrow as pa + + if type is not None and not isinstance(type, IntervalType): + raise TypeError("not supported") + + # TODO better conversion to arrow type, handle missing values + subtype = pa.type_for_alias(str(self.dtype.subtype)) + interval_type = IntervalType(subtype, self.closed) + storage_array = pa.StructArray.from_arrays( + [ + pa.array(self.left, type=subtype, from_pandas=True), + pa.array(self.right, type=subtype, from_pandas=True), + ], + names=["left", "right"], + ) + return pa.ExtensionArray.from_storage(interval_type, storage_array) + _interval_shared_docs[ "to_tuples" ] = """ @@ -1226,3 +1257,51 @@ def maybe_convert_platform_interval(values): values = np.asarray(values) return maybe_convert_platform(values) + + +if _PYARROW_INSTALLED and ( + LooseVersion(pyarrow.__version__) >= LooseVersion("0.14.1.dev") +): + + class IntervalType(pyarrow.ExtensionType): + def __init__(self, subtype, closed): + # attributes need to be set first before calling + # super init (as that calls serialize) + assert closed in _VALID_CLOSED + self._subtype = pyarrow.type_for_alias(str(subtype)) + self._closed = closed + storage_type = pyarrow.struct([("left", subtype), ("right", subtype)]) + pyarrow.ExtensionType.__init__(self, storage_type, "pandas.interval") + + @property + def subtype(self): + return self._subtype + + @property + def closed(self): + return self._closed + + def __arrow_ext_serialize__(self): + metadata = {"subtype": str(self.subtype), "closed": self.closed} + return json.dumps(metadata).encode() + + @classmethod + def __arrow_ext_deserialize__(cls, storage_type, serialized): + metadata = json.loads(serialized.decode()) + subtype = pyarrow.type_for_alias(metadata["subtype"]) + closed = metadata["closed"] + return 
IntervalType(subtype, closed) + + def __eq__(self, other): + if isinstance(other, pyarrow.BaseExtensionType): + return ( + type(self) == type(other) + and self.subtype == other.subtupe + and self.closed == other.closed + ) + else: + return NotImplemented + + # register the type with a dummy instance + _interval_type = IntervalType(pyarrow.int64(), "left") + pyarrow.register_extension_type(_interval_type) diff --git a/pandas/tests/arrays/interval/test_interval.py b/pandas/tests/arrays/interval/test_interval.py index 655a6e717119b..7c753ae4f413b 100644 --- a/pandas/tests/arrays/interval/test_interval.py +++ b/pandas/tests/arrays/interval/test_interval.py @@ -1,3 +1,5 @@ +from distutils.version import LooseVersion + import numpy as np import pytest @@ -14,6 +16,13 @@ from pandas.core.arrays import IntervalArray import pandas.util.testing as tm +try: + import pyarrow + + _PYARROW_INSTALLED = True +except ImportError: + _PYARROW_INSTALLED = False + @pytest.fixture( params=[ @@ -103,3 +112,26 @@ def test_repr(): "Length: 2, closed: right, dtype: interval[int64]" ) assert result == expected + + +@pytest.mark.skipif( + not _PYARROW_INSTALLED + or _PYARROW_INSTALLED + and LooseVersion(pyarrow.__version__) < LooseVersion("0.14.1.dev"), + reason="pyarrow >= 0.15.0 required", +) +def test_arrow_array(): + import pyarrow as pa + from pandas.core.arrays.interval import IntervalType + + intervals = pd.interval_range(1, 5, freq=1).array + + arr = pa.array(intervals) + assert isinstance(arr.type, IntervalType) + assert arr.type.closed == intervals.closed + assert arr.type.subtype == pa.int64() + + assert arr.storage.field("left").equals(pa.array([1, 2, 3, 4], type="int64")) + + with pytest.raises(TypeError): + pa.array(intervals, type="float64") From 5eb8ad66c57f827b45e19af615b9c4acc31cedd7 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 10 Sep 2019 21:57:33 +0200 Subject: [PATCH 03/15] rename + make hashable --- pandas/core/arrays/interval.py | 15 ++++++----- 
pandas/core/arrays/period.py | 13 ++++++---- pandas/tests/arrays/interval/test_interval.py | 25 +++++++++++++++++-- pandas/tests/arrays/test_period.py | 24 ++++++++++++++++-- 4 files changed, 62 insertions(+), 15 deletions(-) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 1901db9a9ad7e..e6062d9e8e1c8 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -1051,12 +1051,12 @@ def __arrow_array__(self, type=None): """ import pyarrow as pa - if type is not None and not isinstance(type, IntervalType): + if type is not None and not isinstance(type, ArrowIntervalType): raise TypeError("not supported") # TODO better conversion to arrow type, handle missing values subtype = pa.type_for_alias(str(self.dtype.subtype)) - interval_type = IntervalType(subtype, self.closed) + interval_type = ArrowIntervalType(subtype, self.closed) storage_array = pa.StructArray.from_arrays( [ pa.array(self.left, type=subtype, from_pandas=True), @@ -1263,7 +1263,7 @@ def maybe_convert_platform_interval(values): LooseVersion(pyarrow.__version__) >= LooseVersion("0.14.1.dev") ): - class IntervalType(pyarrow.ExtensionType): + class ArrowIntervalType(pyarrow.ExtensionType): def __init__(self, subtype, closed): # attributes need to be set first before calling # super init (as that calls serialize) @@ -1290,18 +1290,21 @@ def __arrow_ext_deserialize__(cls, storage_type, serialized): metadata = json.loads(serialized.decode()) subtype = pyarrow.type_for_alias(metadata["subtype"]) closed = metadata["closed"] - return IntervalType(subtype, closed) + return ArrowIntervalType(subtype, closed) def __eq__(self, other): if isinstance(other, pyarrow.BaseExtensionType): return ( type(self) == type(other) - and self.subtype == other.subtupe + and self.subtype == other.subtype and self.closed == other.closed ) else: return NotImplemented + def __hash__(self): + return hash((str(self), str(self.subtype), self.closed)) + # register the type with a dummy 
instance - _interval_type = IntervalType(pyarrow.int64(), "left") + _interval_type = ArrowIntervalType(pyarrow.int64(), "left") pyarrow.register_extension_type(_interval_type) diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 50bf795f4109d..a33497d593eb6 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -353,10 +353,10 @@ def __arrow_array__(self, type=None): """ import pyarrow as pa - if type is not None and not isinstance(type, PeriodType): + if type is not None and not isinstance(type, ArrowPeriodType): raise TypeError("not supported") - period_type = PeriodType(self.freqstr) + period_type = ArrowPeriodType(self.freqstr) storage_array = pa.array(self._data, mask=self.isna(), type="int64") return pa.ExtensionArray.from_storage(period_type, storage_array) @@ -1117,7 +1117,7 @@ def _make_field_arrays(*fields): LooseVersion(pyarrow.__version__) >= LooseVersion("0.14.1.dev") ): - class PeriodType(pyarrow.ExtensionType): + class ArrowPeriodType(pyarrow.ExtensionType): def __init__(self, freq): # attributes need to be set first before calling # super init (as that calls serialize) @@ -1135,7 +1135,7 @@ def __arrow_ext_serialize__(self): @classmethod def __arrow_ext_deserialize__(cls, storage_type, serialized): metadata = json.loads(serialized.decode()) - return PeriodType(metadata["freq"]) + return ArrowPeriodType(metadata["freq"]) def __eq__(self, other): if isinstance(other, pyarrow.BaseExtensionType): @@ -1143,6 +1143,9 @@ def __eq__(self, other): else: return NotImplemented + def __hash__(self): + return hash((str(self), self.freq)) + # register the type with a dummy instance - _period_type = PeriodType("D") + _period_type = ArrowPeriodType("D") pyarrow.register_extension_type(_period_type) diff --git a/pandas/tests/arrays/interval/test_interval.py b/pandas/tests/arrays/interval/test_interval.py index 7c753ae4f413b..6471d5bd43736 100644 --- a/pandas/tests/arrays/interval/test_interval.py +++ 
b/pandas/tests/arrays/interval/test_interval.py @@ -114,6 +114,27 @@ def test_repr(): assert result == expected +@pytest.mark.skipif( + not _PYARROW_INSTALLED + or _PYARROW_INSTALLED + and LooseVersion(pyarrow.__version__) < LooseVersion("0.14.1.dev"), + reason="pyarrow >= 0.15.0 required", +) +def test_arrow_extension_type(): + import pyarrow as pa + from pandas.core.arrays.interval import ArrowIntervalType + + p1 = ArrowIntervalType(pa.int64(), "left") + p2 = ArrowIntervalType(pa.int64(), "left") + p3 = ArrowIntervalType(pa.int64(), "right") + + assert p1.closed == "left" + assert p1 == p2 + assert not p1 == p3 + assert hash(p1) == hash(p2) + assert not hash(p1) == hash(p3) + + @pytest.mark.skipif( not _PYARROW_INSTALLED or _PYARROW_INSTALLED @@ -122,12 +143,12 @@ def test_repr(): ) def test_arrow_array(): import pyarrow as pa - from pandas.core.arrays.interval import IntervalType + from pandas.core.arrays.interval import ArrowIntervalType intervals = pd.interval_range(1, 5, freq=1).array arr = pa.array(intervals) - assert isinstance(arr.type, IntervalType) + assert isinstance(arr.type, ArrowIntervalType) assert arr.type.closed == intervals.closed assert arr.type.subtype == pa.int64() diff --git a/pandas/tests/arrays/test_period.py b/pandas/tests/arrays/test_period.py index 9eb8456e55647..2c8e2e27afd3a 100644 --- a/pandas/tests/arrays/test_period.py +++ b/pandas/tests/arrays/test_period.py @@ -339,6 +339,26 @@ def test_min_max_empty(self, skipna): # Arrow interaction +@pytest.mark.skipif( + not _PYARROW_INSTALLED + or _PYARROW_INSTALLED + and LooseVersion(pyarrow.__version__) < LooseVersion("0.14.1.dev"), + reason="pyarrow >= 0.15.0 required", +) +def test_arrow_extension_type(): + from pandas.core.arrays.period import ArrowPeriodType + + p1 = ArrowPeriodType("D") + p2 = ArrowPeriodType("D") + p3 = ArrowPeriodType("M") + + assert p1.freq == "D" + assert p1 == p2 + assert not p1 == p3 + assert hash(p1) == hash(p2) + assert not hash(p1) == hash(p3) + + 
@pytest.mark.skipif( not _PYARROW_INSTALLED or _PYARROW_INSTALLED @@ -354,11 +374,11 @@ def test_min_max_empty(self, skipna): ) def test_arrow_array(data, freq): import pyarrow as pa - from pandas.core.arrays.period import PeriodType + from pandas.core.arrays.period import ArrowPeriodType periods = period_array(data, freq=freq) arr = pa.array(periods) - assert isinstance(arr.type, PeriodType) + assert isinstance(arr.type, ArrowPeriodType) assert arr.type.freq == freq expected = pa.array(periods.asi8, type="int64") assert arr.storage.equals(expected) From 85bf36c0f6a5a2287a4ead34bcff053d6adde46c Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 5 Nov 2019 15:45:42 +0100 Subject: [PATCH 04/15] better validation of types + tests --- pandas/core/arrays/interval.py | 26 +++++++--- pandas/core/arrays/period.py | 20 ++++++-- pandas/tests/arrays/interval/test_interval.py | 41 ++++++++++------ pandas/tests/arrays/test_period.py | 48 +++++++++++++------ 4 files changed, 96 insertions(+), 39 deletions(-) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 6d69eebbfcbff..87a85bee2b7ff 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -1051,9 +1051,6 @@ def __arrow_array__(self, type=None): """ import pyarrow as pa - if type is not None and not isinstance(type, ArrowIntervalType): - raise TypeError("not supported") - # TODO better conversion to arrow type, handle missing values subtype = pa.type_for_alias(str(self.dtype.subtype)) interval_type = ArrowIntervalType(subtype, self.closed) @@ -1064,6 +1061,25 @@ def __arrow_array__(self, type=None): ], names=["left", "right"], ) + + if type is not None: + if type.equals(interval_type.storage_type): + return storage_array + elif isinstance(type, ArrowIntervalType): + # ensure we have the same subtype and closed attributes + if not type.equals(interval_type): + raise TypeError( + "Not supported to convert IntervalArray to type with " + "different 'subtype' 
({0} vs {1}) and 'closed' ({2} vs {3}) " + "attributes".format( + self.dtype.subtype, type.subtype, self.closed, type.closed + ) + ) + else: + raise TypeError( + "Not supported to convert IntervalArray to '{0}' type".format(type) + ) + return pa.ExtensionArray.from_storage(interval_type, storage_array) _interval_shared_docs[ @@ -1259,9 +1275,7 @@ def maybe_convert_platform_interval(values): return maybe_convert_platform(values) -if _PYARROW_INSTALLED and ( - LooseVersion(pyarrow.__version__) >= LooseVersion("0.14.1.dev") -): +if _PYARROW_INSTALLED and LooseVersion(pyarrow.__version__) >= LooseVersion("0.15"): class ArrowIntervalType(pyarrow.ExtensionType): def __init__(self, subtype, closed): diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index fcb6a9e7071c9..92503222e820e 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -353,8 +353,20 @@ def __arrow_array__(self, type=None): """ import pyarrow as pa - if type is not None and not isinstance(type, ArrowPeriodType): - raise TypeError("not supported") + if type is not None: + if pa.types.is_integer(type): + return pa.array(self._data, mask=self.isna(), type=type) + elif isinstance(type, ArrowPeriodType): + # ensure we have the same freq + if self.freqstr != type.freq: + raise TypeError( + "Not supported to convert PeriodArray to array with different" + " 'freq' ({0} vs {1})".format(self.freqstr, type.freq) + ) + else: + raise TypeError( + "Not supported to convert PeriodArray to '{0}' type".format(type) + ) period_type = ArrowPeriodType(self.freqstr) storage_array = pa.array(self._data, mask=self.isna(), type="int64") @@ -1115,9 +1127,7 @@ def _make_field_arrays(*fields): return arrays -if _PYARROW_INSTALLED and ( - LooseVersion(pyarrow.__version__) >= LooseVersion("0.14.1.dev") -): +if _PYARROW_INSTALLED and LooseVersion(pyarrow.__version__) >= LooseVersion("0.15"): class ArrowPeriodType(pyarrow.ExtensionType): def __init__(self, freq): diff --git 
a/pandas/tests/arrays/interval/test_interval.py b/pandas/tests/arrays/interval/test_interval.py index 6471d5bd43736..2934aba03119f 100644 --- a/pandas/tests/arrays/interval/test_interval.py +++ b/pandas/tests/arrays/interval/test_interval.py @@ -114,12 +114,19 @@ def test_repr(): assert result == expected -@pytest.mark.skipif( +# ---------------------------------------------------------------------------- +# Arrow interaction + + +pyarrow_skip = pytest.mark.skipif( not _PYARROW_INSTALLED or _PYARROW_INSTALLED - and LooseVersion(pyarrow.__version__) < LooseVersion("0.14.1.dev"), - reason="pyarrow >= 0.15.0 required", + and LooseVersion(pyarrow.__version__) < LooseVersion("0.15.1.dev"), + reason="pyarrow > 0.15 required", ) + + +@pyarrow_skip def test_arrow_extension_type(): import pyarrow as pa from pandas.core.arrays.interval import ArrowIntervalType @@ -135,24 +142,30 @@ def test_arrow_extension_type(): assert not hash(p1) == hash(p3) -@pytest.mark.skipif( - not _PYARROW_INSTALLED - or _PYARROW_INSTALLED - and LooseVersion(pyarrow.__version__) < LooseVersion("0.14.1.dev"), - reason="pyarrow >= 0.15.0 required", -) +@pyarrow_skip def test_arrow_array(): import pyarrow as pa from pandas.core.arrays.interval import ArrowIntervalType intervals = pd.interval_range(1, 5, freq=1).array - arr = pa.array(intervals) - assert isinstance(arr.type, ArrowIntervalType) - assert arr.type.closed == intervals.closed - assert arr.type.subtype == pa.int64() + result = pa.array(intervals) + assert isinstance(result.type, ArrowIntervalType) + assert result.type.closed == intervals.closed + assert result.type.subtype == pa.int64() + assert result.storage.field("left").equals(pa.array([1, 2, 3, 4], type="int64")) + assert result.storage.field("right").equals(pa.array([2, 3, 4, 5], type="int64")) - assert arr.storage.field("left").equals(pa.array([1, 2, 3, 4], type="int64")) + expected = pa.array([{"left": i, "right": i + 1} for i in range(1, 5)]) + assert result.storage.equals(expected) 
+ # convert to its storage type + result = pa.array(intervals, type=expected.type) + assert result.equals(expected) + + # unsupported conversions with pytest.raises(TypeError): pa.array(intervals, type="float64") + + with pytest.raises(TypeError, match="different 'subtype'"): + pa.array(intervals, type=ArrowIntervalType(pa.float64(), "left")) diff --git a/pandas/tests/arrays/test_period.py b/pandas/tests/arrays/test_period.py index 2c8e2e27afd3a..781455bc38b76 100644 --- a/pandas/tests/arrays/test_period.py +++ b/pandas/tests/arrays/test_period.py @@ -338,13 +338,15 @@ def test_min_max_empty(self, skipna): # ---------------------------------------------------------------------------- # Arrow interaction - -@pytest.mark.skipif( +pyarrow_skip = pytest.mark.skipif( not _PYARROW_INSTALLED or _PYARROW_INSTALLED - and LooseVersion(pyarrow.__version__) < LooseVersion("0.14.1.dev"), - reason="pyarrow >= 0.15.0 required", + and LooseVersion(pyarrow.__version__) < LooseVersion("0.15.1.dev"), + reason="pyarrow > 0.15 required", ) + + +@pyarrow_skip def test_arrow_extension_type(): from pandas.core.arrays.period import ArrowPeriodType @@ -359,12 +361,7 @@ def test_arrow_extension_type(): assert not hash(p1) == hash(p3) -@pytest.mark.skipif( - not _PYARROW_INSTALLED - or _PYARROW_INSTALLED - and LooseVersion(pyarrow.__version__) < LooseVersion("0.14.1.dev"), - reason="pyarrow >= 0.15.0 required", -) +@pyarrow_skip @pytest.mark.parametrize( "data, freq", [ @@ -377,11 +374,34 @@ def test_arrow_array(data, freq): from pandas.core.arrays.period import ArrowPeriodType periods = period_array(data, freq=freq) - arr = pa.array(periods) - assert isinstance(arr.type, ArrowPeriodType) - assert arr.type.freq == freq + result = pa.array(periods) + assert isinstance(result.type, ArrowPeriodType) + assert result.type.freq == freq expected = pa.array(periods.asi8, type="int64") - assert arr.storage.equals(expected) + assert result.storage.equals(expected) + + # convert to its storage type + 
result = pa.array(periods, type=pa.int64()) + assert result.equals(expected) + # unsupported conversions with pytest.raises(TypeError): pa.array(periods, type="float64") + + with pytest.raises(TypeError, match="different 'freq'"): + pa.array(periods, type=ArrowPeriodType("T")) + + +@pyarrow_skip +def test_arrow_array_missing(): + import pyarrow as pa + from pandas.core.arrays.period import ArrowPeriodType + + arr = PeriodArray([1, 2, 3], freq="D") + arr[1] = pd.NaT + + result = pa.array(arr) + assert isinstance(result.type, ArrowPeriodType) + assert result.type.freq == "D" + expected = pa.array([1, None, 3], type="int64") + assert result.storage.equals(expected) From f325ff1c5c53ed1c7d14acae06d0c20e08cc72c9 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 5 Nov 2019 16:30:33 +0100 Subject: [PATCH 05/15] add tests for missing values with IntervalArray --- pandas/tests/arrays/interval/test_interval.py | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/pandas/tests/arrays/interval/test_interval.py b/pandas/tests/arrays/interval/test_interval.py index 2934aba03119f..2aa71804d04c2 100644 --- a/pandas/tests/arrays/interval/test_interval.py +++ b/pandas/tests/arrays/interval/test_interval.py @@ -169,3 +169,33 @@ def test_arrow_array(): with pytest.raises(TypeError, match="different 'subtype'"): pa.array(intervals, type=ArrowIntervalType(pa.float64(), "left")) + + +@pyarrow_skip +def test_arrow_array_missing(): + import pyarrow as pa + from pandas.core.arrays.interval import ArrowIntervalType + + arr = pd.arrays.IntervalArray.from_breaks([0, 1, 2, 3]) + arr[1] = None + + result = pa.array(arr) + assert isinstance(result.type, ArrowIntervalType) + assert result.type.closed == arr.closed + assert result.type.subtype == pa.float64() + + # fields have missing values (not NaN) + left = pa.array([0.0, None, 2.0], type="float64") + right = pa.array([1.0, None, 3.0], type="float64") + assert result.storage.field("left").equals(left) + assert 
result.storage.field("right").equals(right) + + # TODO implement setting the missing values bitmap on the array level + # # structarray itself also has missing values on the array level + # vals = [ + # {"left": 0.0, "right": 1.0}, + # {"left": None, "right": None}, + # {"left": 2.0, "right": 3.0}, + # ] + # expected = pa.StructArray.from_pandas(vals, mask=np.array([False, True, False])) + # assert result.storage.equals(expected) From 82589dde8d63cbdc8c9fad76c11fd318c67cb9f0 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 5 Nov 2019 17:48:19 +0100 Subject: [PATCH 06/15] Add arrow -> pandas conversion + tests --- pandas/core/arrays/interval.py | 13 ++++- pandas/core/dtypes/dtypes.py | 49 +++++++++++++++++++ pandas/tests/arrays/interval/test_interval.py | 28 ++++++++++- pandas/tests/arrays/test_period.py | 21 ++++++++ 4 files changed, 108 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 87a85bee2b7ff..ba91d837e06f6 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -1052,7 +1052,11 @@ def __arrow_array__(self, type=None): import pyarrow as pa # TODO better conversion to arrow type, handle missing values - subtype = pa.type_for_alias(str(self.dtype.subtype)) + subtype = str(self.dtype.subtype) + if subtype == "datetime64[ns]": + subtype = pyarrow.timestamp("ns") + else: + subtype = pyarrow.type_for_alias(subtype) interval_type = ArrowIntervalType(subtype, self.closed) storage_array = pa.StructArray.from_arrays( [ @@ -1282,7 +1286,12 @@ def __init__(self, subtype, closed): # attributes need to be set first before calling # super init (as that calls serialize) assert closed in _VALID_CLOSED - self._subtype = pyarrow.type_for_alias(str(subtype)) + # TODO proper conversion from pandas to pyarrow types + subtype = str(subtype) + if subtype == "datetime64[ns]": + self._subtype = pyarrow.timestamp("ns") + else: + self._subtype = pyarrow.type_for_alias(subtype) 
self._closed = closed storage_type = pyarrow.struct([("left", subtype), ("right", subtype)]) pyarrow.ExtensionType.__init__(self, storage_type, "pandas.interval") diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 7dca588e33839..1c936b3dc9e65 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -944,6 +944,36 @@ def construct_array_type(cls): return PeriodArray + def __from_arrow__(self, array): + """Construct PeriodArray from pyarrow Array/ChunkedArray.""" + import pyarrow + from pandas.core.arrays import PeriodArray + + if isinstance(array, pyarrow.Array): + chunks = [array] + else: + chunks = array.chunks + + results = [] + for arr in chunks: + buflist = arr.buffers() + data = np.frombuffer(buflist[-1], dtype="int64")[ + arr.offset : arr.offset + len(arr) + ] + bitmask = buflist[0] + if bitmask is not None: + mask = pyarrow.BooleanArray.from_buffers( + pyarrow.bool_(), len(arr), [None, bitmask] + ) + mask = np.asarray(mask) + else: + mask = np.ones(len(arr), dtype=bool) + parr = PeriodArray(data.copy(), freq=self.freq, copy=False) + parr[~mask] = NaT + results.append(parr) + + return PeriodArray._concat_same_type(results) + @register_extension_dtype class IntervalDtype(PandasExtensionDtype): @@ -1115,3 +1145,22 @@ def is_dtype(cls, dtype): else: return False return super().is_dtype(dtype) + + def __from_arrow__(self, array): + """Construct IntervalArray from pyarrow Array/ChunkedArray.""" + import pyarrow + from pandas.core.arrays import IntervalArray + + if isinstance(array, pyarrow.Array): + chunks = [array] + else: + chunks = array.chunks + + results = [] + for arr in chunks: + left = np.asarray(arr.storage.field("left"), dtype=self.subtype) + right = np.asarray(arr.storage.field("right"), dtype=self.subtype) + iarr = IntervalArray.from_arrays(left, right, closed=array.type.closed) + results.append(iarr) + + return IntervalArray._concat_same_type(results) diff --git 
a/pandas/tests/arrays/interval/test_interval.py b/pandas/tests/arrays/interval/test_interval.py index 2aa71804d04c2..f7ca87ea2b3eb 100644 --- a/pandas/tests/arrays/interval/test_interval.py +++ b/pandas/tests/arrays/interval/test_interval.py @@ -176,7 +176,7 @@ def test_arrow_array_missing(): import pyarrow as pa from pandas.core.arrays.interval import ArrowIntervalType - arr = pd.arrays.IntervalArray.from_breaks([0, 1, 2, 3]) + arr = IntervalArray.from_breaks([0, 1, 2, 3]) arr[1] = None result = pa.array(arr) @@ -199,3 +199,29 @@ def test_arrow_array_missing(): # ] # expected = pa.StructArray.from_pandas(vals, mask=np.array([False, True, False])) # assert result.storage.equals(expected) + + +@pyarrow_skip +@pytest.mark.parametrize( + "breaks", + [[0, 1, 2, 3], pd.date_range("2017", periods=4, freq="D")], + ids=["int", "datetime64[ns]"], +) +def test_arrow_table_roundtrip(breaks): + import pyarrow as pa + from pandas.core.arrays.interval import ArrowIntervalType + + arr = IntervalArray.from_breaks(breaks) + arr[1] = None + df = pd.DataFrame({"a": arr}) + + table = pa.table(df) + assert isinstance(table.field("a").type, ArrowIntervalType) + result = table.to_pandas() + assert isinstance(result["a"].dtype, pd.IntervalDtype) + tm.assert_frame_equal(result, df) + + table2 = pa.concat_tables([table, table]) + result = table2.to_pandas() + expected = pd.concat([df, df], ignore_index=True) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/arrays/test_period.py b/pandas/tests/arrays/test_period.py index 781455bc38b76..eaf4b31198b5c 100644 --- a/pandas/tests/arrays/test_period.py +++ b/pandas/tests/arrays/test_period.py @@ -405,3 +405,24 @@ def test_arrow_array_missing(): assert result.type.freq == "D" expected = pa.array([1, None, 3], type="int64") assert result.storage.equals(expected) + + +@pyarrow_skip +def test_arrow_table_roundtrip(): + import pyarrow as pa + from pandas.core.arrays.period import ArrowPeriodType + + arr = PeriodArray([1, 2, 3], 
freq="D") + arr[1] = pd.NaT + df = pd.DataFrame({"a": arr}) + + table = pa.table(df) + assert isinstance(table.field("a").type, ArrowPeriodType) + result = table.to_pandas() + assert isinstance(result["a"].dtype, PeriodDtype) + tm.assert_frame_equal(result, df) + + table2 = pa.concat_tables([table, table]) + result = table2.to_pandas() + expected = pd.concat([df, df], ignore_index=True) + tm.assert_frame_equal(result, expected) From 70e7023364f8eda4807d58c20a7767556c1f9c76 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 8 Nov 2019 13:55:59 +0100 Subject: [PATCH 07/15] fix interval subtype and missing value handling --- pandas/core/arrays/interval.py | 23 ++++++++++++++----- pandas/tests/arrays/interval/test_interval.py | 17 +++++++------- 2 files changed, 25 insertions(+), 15 deletions(-) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 07b62fb266c67..b34d8e4aeb2e7 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -1051,12 +1051,13 @@ def __arrow_array__(self, type=None): """ import pyarrow as pa - # TODO better conversion to arrow type, handle missing values - subtype = str(self.dtype.subtype) - if subtype == "datetime64[ns]": - subtype = pyarrow.timestamp("ns") - else: - subtype = pyarrow.type_for_alias(subtype) + try: + subtype = pa.from_numpy_dtype(self.dtype.subtype) + except TypeError: + raise TypeError( + "Conversion to arrow with subtype '{}' " + "is not supported".format(self.dtype.subtype) + ) interval_type = ArrowIntervalType(subtype, self.closed) storage_array = pa.StructArray.from_arrays( [ @@ -1065,6 +1066,16 @@ def __arrow_array__(self, type=None): ], names=["left", "right"], ) + mask = self.isna() + if mask.any(): + # if there are missing values, set validity bitmap also on the array level + null_bitmap = pa.array(~mask).buffers()[1] + storage_array = pa.StructArray.from_buffers( + storage_array.type, + len(storage_array), + [null_bitmap], + 
children=[storage_array.field(0), storage_array.field(1)], + ) if type is not None: if type.equals(interval_type.storage_type): diff --git a/pandas/tests/arrays/interval/test_interval.py b/pandas/tests/arrays/interval/test_interval.py index f7ca87ea2b3eb..98e674f7d08bd 100644 --- a/pandas/tests/arrays/interval/test_interval.py +++ b/pandas/tests/arrays/interval/test_interval.py @@ -190,15 +190,14 @@ def test_arrow_array_missing(): assert result.storage.field("left").equals(left) assert result.storage.field("right").equals(right) - # TODO implement setting the missing values bitmap on the array level - # # structarray itself also has missing values on the array level - # vals = [ - # {"left": 0.0, "right": 1.0}, - # {"left": None, "right": None}, - # {"left": 2.0, "right": 3.0}, - # ] - # expected = pa.StructArray.from_pandas(vals, mask=np.array([False, True, False])) - # assert result.storage.equals(expected) + # structarray itself also has missing values on the array level + vals = [ + {"left": 0.0, "right": 1.0}, + {"left": None, "right": None}, + {"left": 2.0, "right": 3.0}, + ] + expected = pa.StructArray.from_pandas(vals, mask=np.array([False, True, False])) + assert result.storage.equals(expected) @pyarrow_skip From 6587bd2b1e83f4debbca9af7cebacdc6b30c96a4 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 21 Nov 2019 15:19:22 +0100 Subject: [PATCH 08/15] use skip_if_no decorator --- pandas/tests/arrays/interval/test_interval.py | 18 +++--------------- pandas/tests/arrays/test_period.py | 18 ++---------------- 2 files changed, 5 insertions(+), 31 deletions(-) diff --git a/pandas/tests/arrays/interval/test_interval.py b/pandas/tests/arrays/interval/test_interval.py index 98e674f7d08bd..61c85a79c4c95 100644 --- a/pandas/tests/arrays/interval/test_interval.py +++ b/pandas/tests/arrays/interval/test_interval.py @@ -1,8 +1,8 @@ -from distutils.version import LooseVersion - import numpy as np import pytest +import pandas.util._test_decorators as td + 
import pandas as pd from pandas import ( Index, @@ -16,13 +16,6 @@ from pandas.core.arrays import IntervalArray import pandas.util.testing as tm -try: - import pyarrow - - _PYARROW_INSTALLED = True -except ImportError: - _PYARROW_INSTALLED = False - @pytest.fixture( params=[ @@ -118,12 +111,7 @@ def test_repr(): # Arrow interaction -pyarrow_skip = pytest.mark.skipif( - not _PYARROW_INSTALLED - or _PYARROW_INSTALLED - and LooseVersion(pyarrow.__version__) < LooseVersion("0.15.1.dev"), - reason="pyarrow > 0.15 required", -) +pyarrow_skip = td.skip_if_no("pyarrow", min_version="0.15.1.dev") @pyarrow_skip diff --git a/pandas/tests/arrays/test_period.py b/pandas/tests/arrays/test_period.py index eaf4b31198b5c..353298ac29ec6 100644 --- a/pandas/tests/arrays/test_period.py +++ b/pandas/tests/arrays/test_period.py @@ -1,10 +1,9 @@ -from distutils.version import LooseVersion - import numpy as np import pytest from pandas._libs.tslibs import iNaT from pandas._libs.tslibs.period import IncompatibleFrequency +import pandas.util._test_decorators as td from pandas.core.dtypes.dtypes import PeriodDtype, registry @@ -12,14 +11,6 @@ from pandas.core.arrays import PeriodArray, period_array import pandas.util.testing as tm -try: - import pyarrow - - _PYARROW_INSTALLED = True -except ImportError: - _PYARROW_INSTALLED = False - - # ---------------------------------------------------------------------------- # Dtype @@ -338,12 +329,7 @@ def test_min_max_empty(self, skipna): # ---------------------------------------------------------------------------- # Arrow interaction -pyarrow_skip = pytest.mark.skipif( - not _PYARROW_INSTALLED - or _PYARROW_INSTALLED - and LooseVersion(pyarrow.__version__) < LooseVersion("0.15.1.dev"), - reason="pyarrow > 0.15 required", -) +pyarrow_skip = pyarrow_skip = td.skip_if_no("pyarrow", min_version="0.15.1.dev") @pyarrow_skip From 5303bae10372bb2f9a9ba31c9d17c99cdcb92a51 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 21 Nov 2019 15:53:36 
+0100 Subject: [PATCH 09/15] add parquet tests --- doc/source/user_guide/io.rst | 4 ++-- doc/source/whatsnew/v1.0.0.rst | 4 ++-- pandas/tests/io/test_parquet.py | 31 ++++++++++++++++++++++++++----- 3 files changed, 30 insertions(+), 9 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index fa47a5944f7bf..d74d44035d0c8 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -4714,8 +4714,8 @@ Several caveats. * Index level names, if specified, must be strings. * In the ``pyarrow`` engine, categorical dtypes for non-string types can be serialized to parquet, but will de-serialize as their primitive dtype. * The ``pyarrow`` engine preserves the ``ordered`` flag of categorical dtypes with string types. ``fastparquet`` does not preserve the ``ordered`` flag. -* Non supported types include ``Period`` and actual Python object types. These will raise a helpful error message - on an attempt at serialization. +* Non supported types include ``Interval`` and actual Python object types. These will raise a helpful error message + on an attempt at serialization. ``Period`` type is supported with pyarrow >= 1.0.0. * The ``pyarrow`` engine preserves extension data types such as the nullable integer and string data type (requiring pyarrow >= 1.0.0, and requiring the extension type to implement the needed protocols, see the :ref:`extension types documentation <extending.extension.arrow>`).
diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 54640ff576338..0948b3d43784e 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -122,9 +122,9 @@ Other enhancements - Added ``encoding`` argument to :meth:`DataFrame.to_string` for non-ascii text (:issue:`28766`) - Added ``encoding`` argument to :func:`DataFrame.to_html` for non-ascii text (:issue:`28663`) - :meth:`Styler.background_gradient` now accepts ``vmin`` and ``vmax`` arguments (:issue:`12145`) -- Roundtripping DataFrames with nullable integer or string data types to parquet +- Roundtripping DataFrames with nullable integer, string and period data types to parquet (:meth:`~DataFrame.to_parquet` / :func:`read_parquet`) using the `'pyarrow'` engine - now preserve those data types with pyarrow >= 1.0.0 (:issue:`20612`). + now preserve those data types with pyarrow >= 1.0.0 (:issue:`20612`, :issue:`28371`). Build Changes ^^^^^^^^^^^^^ diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index bcbbee3b86769..e6452df80c82a 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -443,11 +443,12 @@ def test_duplicate_columns(self, pa): self.check_error_on_write(df, pa, ValueError) def test_unsupported(self, pa): - # period - df = pd.DataFrame({"a": pd.period_range("2013", freq="M", periods=3)}) - # pyarrow 0.11 raises ArrowTypeError - # older pyarrows raise ArrowInvalid - self.check_error_on_write(df, pa, Exception) + if LooseVersion(pyarrow.__version__) < LooseVersion("0.15.1.dev"): + # period - will be supported using an extension type with pyarrow 1.0 + df = pd.DataFrame({"a": pd.period_range("2013", freq="M", periods=3)}) + # pyarrow 0.11 raises ArrowTypeError + # older pyarrows raise ArrowInvalid + self.check_error_on_write(df, pa, Exception) # timedelta df = pd.DataFrame({"a": pd.timedelta_range("1 day", periods=3)}) @@ -529,6 +530,26 @@ def test_additional_extension_arrays(self, pa): 
expected = df.assign(a=df.a.astype("float64")) check_round_trip(df, pa, expected=expected) + @td.skip_if_no("pyarrow", min_version="0.15.0") + def test_additional_extension_types(self, pa): + # test additional ExtensionArrays that are supported through the + # __arrow_array__ protocol + by defining a custom ExtensionType + df = pd.DataFrame( + { + # Arrow does not yet support struct in writing to Parquet (ARROW-1644) + # "c": pd.arrays.IntervalArray.from_tuples([(0, 1), (1, 2), (3, 4)]), + "d": pd.period_range("2012-01-01", periods=3, freq="D"), + } + ) + if LooseVersion(pyarrow.__version__) >= LooseVersion("0.15.1.dev"): + check_round_trip(df, pa) + else: + # writing works, reading not yet (on pyarrow 0.15) + with tm.ensure_clean() as path: + df.to_parquet(path, engine=pa) + with pytest.raises(Exception): + pd.read_parquet(path, engine=pa) + class TestParquetFastParquet(Base): @td.skip_if_no("fastparquet", min_version="0.2.1") From a97808ce410f546f7799ae19ee2d56fecea10f85 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 21 Nov 2019 15:59:32 +0100 Subject: [PATCH 10/15] clean-up type conversion --- pandas/core/arrays/interval.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 4bd673dcf349d..6cc1ab5a613f2 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -1297,13 +1297,11 @@ def __init__(self, subtype, closed): # attributes need to be set first before calling # super init (as that calls serialize) assert closed in _VALID_CLOSED - # TODO proper conversion from pandas to pyarrow types - subtype = str(subtype) - if subtype == "datetime64[ns]": - self._subtype = pyarrow.timestamp("ns") - else: - self._subtype = pyarrow.type_for_alias(subtype) self._closed = closed + if not isinstance(subtype, pyarrow.DataType): + subtype = pyarrow.type_for_alias(str(subtype)) + self._subtype = subtype + storage_type = 
pyarrow.struct([("left", subtype), ("right", subtype)]) pyarrow.ExtensionType.__init__(self, storage_type, "pandas.interval") From e9a032dad526053b2813b85427664b6d51553bc7 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 10 Dec 2019 14:59:24 +0100 Subject: [PATCH 11/15] period test only for pyarrow 0.15dev (in 0.15 .values was used which does not use the EA) --- pandas/tests/io/test_parquet.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index b1bb61c0ce275..4081b4412998a 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -531,7 +531,7 @@ def test_additional_extension_arrays(self, pa): expected = df.assign(a=df.a.astype("float64")) check_round_trip(df, pa, expected=expected) - @td.skip_if_no("pyarrow", min_version="0.15.0") + @td.skip_if_no("pyarrow", min_version="0.15.1.dev") def test_additional_extension_types(self, pa): # test additional ExtensionArrays that are supported through the # __arrow_array__ protocol + by defining a custom ExtensionType @@ -542,14 +542,7 @@ def test_additional_extension_types(self, pa): "d": pd.period_range("2012-01-01", periods=3, freq="D"), } ) - if LooseVersion(pyarrow.__version__) >= LooseVersion("0.15.1.dev"): - check_round_trip(df, pa) - else: - # writing works, reading not yet (on pyarrow 0.15) - with tm.ensure_clean() as path: - df.to_parquet(path, engine=pa) - with pytest.raises(Exception): - pd.read_parquet(path, engine=pa) + check_round_trip(df, pa) class TestParquetFastParquet(Base): From 1b6f21e58b2c94a021b90814c20b24eb35c7f17d Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 6 Jan 2020 11:35:41 +0100 Subject: [PATCH 12/15] move common things to _arrow_utils --- pandas/core/arrays/_arrow_utils.py | 46 ++++++++++++++++++++++++++++++ pandas/core/arrays/interval.py | 29 +++++++------------ pandas/core/arrays/period.py | 22 ++++++-------- 
pandas/core/dtypes/dtypes.py | 14 ++------- 4 files changed, 67 insertions(+), 44 deletions(-) create mode 100644 pandas/core/arrays/_arrow_utils.py diff --git a/pandas/core/arrays/_arrow_utils.py b/pandas/core/arrays/_arrow_utils.py new file mode 100644 index 0000000000000..1c7c47c474337 --- /dev/null +++ b/pandas/core/arrays/_arrow_utils.py @@ -0,0 +1,46 @@ +from distutils.version import LooseVersion + +import numpy as np + +try: + import pyarrow + + _PYARROW_INSTALLED = True +except ImportError: + _PYARROW_INSTALLED = False + pyarrow = None + + +if _PYARROW_INSTALLED: + _pyarrow_version_ge_015 = LooseVersion(pyarrow.__version__) >= LooseVersion("0.15") +else: + _pyarrow_version_ge_015 = False + + +def pyarrow_array_to_numpy_and_mask(arr, dtype): + """ + Convert a primitive pyarrow.Array to a numpy array and boolean mask based + on the buffers of the Array. + + Parameters + ---------- + arr : pyarrow.Array + dtype : numpy.dtype + + Returns + ------- + (data, mask) + Tuple of two numpy arrays with the raw data (with specified dtype) and + a boolean mask (validity mask, so False means missing) + """ + buflist = arr.buffers() + data = np.frombuffer(buflist[-1], dtype=dtype)[arr.offset : arr.offset + len(arr)] + bitmask = buflist[0] + if bitmask is not None: + mask = pyarrow.BooleanArray.from_buffers( + pyarrow.bool_(), len(arr), [None, bitmask] + ) + mask = np.asarray(mask) + else: + mask = np.ones(len(arr), dtype=bool) + return data, mask diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 19b1c813d96a7..edd9d5bc2761b 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -1,4 +1,3 @@ -from distutils.version import LooseVersion import json from operator import le, lt import textwrap @@ -38,20 +37,13 @@ from pandas.core.dtypes.missing import isna, notna from pandas.core.algorithms import take, value_counts +from pandas.core.arrays._arrow_utils import _PYARROW_INSTALLED, _pyarrow_version_ge_015 from 
pandas.core.arrays.base import ExtensionArray, _extension_array_shared_docs from pandas.core.arrays.categorical import Categorical import pandas.core.common as com from pandas.core.construction import array from pandas.core.indexes.base import ensure_index -try: - import pyarrow - - _PYARROW_INSTALLED = True -except ImportError: - _PYARROW_INSTALLED = False - - _VALID_CLOSED = {"left", "right", "both", "neither"} _interval_shared_docs = {} @@ -1095,28 +1087,28 @@ def __arrow_array__(self, type=None): """ Convert myself into a pyarrow Array. """ - import pyarrow as pa + import pyarrow try: - subtype = pa.from_numpy_dtype(self.dtype.subtype) + subtype = pyarrow.from_numpy_dtype(self.dtype.subtype) except TypeError: raise TypeError( "Conversion to arrow with subtype '{}' " "is not supported".format(self.dtype.subtype) ) interval_type = ArrowIntervalType(subtype, self.closed) - storage_array = pa.StructArray.from_arrays( + storage_array = pyarrow.StructArray.from_arrays( [ - pa.array(self.left, type=subtype, from_pandas=True), - pa.array(self.right, type=subtype, from_pandas=True), + pyarrow.array(self.left, type=subtype, from_pandas=True), + pyarrow.array(self.right, type=subtype, from_pandas=True), ], names=["left", "right"], ) mask = self.isna() if mask.any(): # if there are missing values, set validity bitmap also on the array level - null_bitmap = pa.array(~mask).buffers()[1] - storage_array = pa.StructArray.from_buffers( + null_bitmap = pyarrow.array(~mask).buffers()[1] + storage_array = pyarrow.StructArray.from_buffers( storage_array.type, len(storage_array), [null_bitmap], @@ -1141,7 +1133,7 @@ def __arrow_array__(self, type=None): "Not supported to convert IntervalArray to '{0}' type".format(type) ) - return pa.ExtensionArray.from_storage(interval_type, storage_array) + return pyarrow.ExtensionArray.from_storage(interval_type, storage_array) _interval_shared_docs[ "to_tuples" @@ -1336,7 +1328,8 @@ def maybe_convert_platform_interval(values): return 
maybe_convert_platform(values) -if _PYARROW_INSTALLED and LooseVersion(pyarrow.__version__) >= LooseVersion("0.15"): +if _PYARROW_INSTALLED and _pyarrow_version_ge_015: + import pyarrow class ArrowIntervalType(pyarrow.ExtensionType): def __init__(self, subtype, closed): diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 73d85d5635f15..6f2c5c6844f50 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -1,5 +1,4 @@ from datetime import timedelta -from distutils.version import LooseVersion import json import operator from typing import Any, Callable, List, Optional, Sequence, Union @@ -47,6 +46,7 @@ from pandas.core import ops import pandas.core.algorithms as algos from pandas.core.arrays import datetimelike as dtl +from pandas.core.arrays._arrow_utils import _PYARROW_INSTALLED, _pyarrow_version_ge_015 import pandas.core.common as com from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.core.ops.invalid import invalid_comparison @@ -54,13 +54,6 @@ from pandas.tseries import frequencies from pandas.tseries.offsets import DateOffset, Tick, _delta_to_tick -try: - import pyarrow - - _PYARROW_INSTALLED = True -except ImportError: - _PYARROW_INSTALLED = False - def _field_accessor(name, alias, docstring=None): def f(self): @@ -376,11 +369,11 @@ def __arrow_array__(self, type=None): """ Convert myself into a pyarrow Array. 
""" - import pyarrow as pa + import pyarrow if type is not None: - if pa.types.is_integer(type): - return pa.array(self._data, mask=self.isna(), type=type) + if pyarrow.types.is_integer(type): + return pyarrow.array(self._data, mask=self.isna(), type=type) elif isinstance(type, ArrowPeriodType): # ensure we have the same freq if self.freqstr != type.freq: @@ -394,8 +387,8 @@ def __arrow_array__(self, type=None): ) period_type = ArrowPeriodType(self.freqstr) - storage_array = pa.array(self._data, mask=self.isna(), type="int64") - return pa.ExtensionArray.from_storage(period_type, storage_array) + storage_array = pyarrow.array(self._data, mask=self.isna(), type="int64") + return pyarrow.ExtensionArray.from_storage(period_type, storage_array) # -------------------------------------------------------------------- # Vectorized analogues of Period properties @@ -1157,7 +1150,8 @@ def _make_field_arrays(*fields): return arrays -if _PYARROW_INSTALLED and LooseVersion(pyarrow.__version__) >= LooseVersion("0.15"): +if _PYARROW_INSTALLED and _pyarrow_version_ge_015: + import pyarrow class ArrowPeriodType(pyarrow.ExtensionType): def __init__(self, freq): diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 22f7ed49f86b7..1df7d9028171d 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -954,6 +954,7 @@ def __from_arrow__(self, array): """Construct PeriodArray from pyarrow Array/ChunkedArray.""" import pyarrow from pandas.core.arrays import PeriodArray + from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask if isinstance(array, pyarrow.Array): chunks = [array] @@ -962,18 +963,7 @@ def __from_arrow__(self, array): results = [] for arr in chunks: - buflist = arr.buffers() - data = np.frombuffer(buflist[-1], dtype="int64")[ - arr.offset : arr.offset + len(arr) - ] - bitmask = buflist[0] - if bitmask is not None: - mask = pyarrow.BooleanArray.from_buffers( - pyarrow.bool_(), len(arr), [None, bitmask] - ) - 
mask = np.asarray(mask) - else: - mask = np.ones(len(arr), dtype=bool) + data, mask = pyarrow_array_to_numpy_and_mask(arr, dtype="int64") parr = PeriodArray(data.copy(), freq=self.freq, copy=False) parr[~mask] = NaT results.append(parr) From 4156718b79e2707200d2b11daeb9fc148ed60e47 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 8 Jan 2020 11:31:43 +0100 Subject: [PATCH 13/15] use common function in IntDtype from_arrow --- pandas/core/arrays/_arrow_utils.py | 2 +- pandas/core/arrays/integer.py | 14 ++------------ 2 files changed, 3 insertions(+), 13 deletions(-) diff --git a/pandas/core/arrays/_arrow_utils.py b/pandas/core/arrays/_arrow_utils.py index 1c7c47c474337..5f9004b5d7e0b 100644 --- a/pandas/core/arrays/_arrow_utils.py +++ b/pandas/core/arrays/_arrow_utils.py @@ -34,7 +34,7 @@ def pyarrow_array_to_numpy_and_mask(arr, dtype): a boolean mask (validity mask, so False means missing) """ buflist = arr.buffers() - data = np.frombuffer(buflist[-1], dtype=dtype)[arr.offset : arr.offset + len(arr)] + data = np.frombuffer(buflist[1], dtype=dtype)[arr.offset : arr.offset + len(arr)] bitmask = buflist[0] if bitmask is not None: mask = pyarrow.BooleanArray.from_buffers( diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index d63692c5ba972..0ef72c6e80200 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -28,6 +28,7 @@ from pandas.core import nanops, ops from pandas.core.algorithms import take from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin +from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask import pandas.core.common as com from pandas.core.indexers import check_bool_array_indexer from pandas.core.ops import invalid_comparison @@ -103,18 +104,7 @@ def __from_arrow__(self, array): results = [] for arr in chunks: - buflist = arr.buffers() - data = np.frombuffer(buflist[1], dtype=self.type)[ - arr.offset : arr.offset + len(arr) - ] - bitmask = buflist[0] - if
bitmask is not None: - mask = pyarrow.BooleanArray.from_buffers( - pyarrow.bool_(), len(arr), [None, bitmask] - ) - mask = np.asarray(mask) - else: - mask = np.ones(len(arr), dtype=bool) + data, mask = pyarrow_array_to_numpy_and_mask(arr, dtype=self.type) int_arr = IntegerArray(data.copy(), ~mask, copy=False) results.append(int_arr) From 92a1edec2c9b42b4c8731cac1b3a15d244013197 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 8 Jan 2020 13:36:30 +0100 Subject: [PATCH 14/15] lazy import for now --- pandas/core/arrays/_arrow_utils.py | 102 +++++++++++++++--- pandas/core/arrays/integer.py | 2 +- pandas/core/arrays/interval.py | 56 +--------- pandas/core/arrays/period.py | 40 +------ pandas/io/parquet.py | 3 + pandas/tests/arrays/interval/test_interval.py | 8 +- pandas/tests/arrays/test_period.py | 8 +- 7 files changed, 104 insertions(+), 115 deletions(-) diff --git a/pandas/core/arrays/_arrow_utils.py b/pandas/core/arrays/_arrow_utils.py index 5f9004b5d7e0b..e0d33bebeb421 100644 --- a/pandas/core/arrays/_arrow_utils.py +++ b/pandas/core/arrays/_arrow_utils.py @@ -1,20 +1,12 @@ from distutils.version import LooseVersion +import json import numpy as np +import pyarrow -try: - import pyarrow +from pandas.core.arrays.interval import _VALID_CLOSED - _PYARROW_INSTALLED = True -except ImportError: - _PYARROW_INSTALLED = False - pyarrow = None - - -if _PYARROW_INSTALLED: - _pyarrow_version_ge_015 = LooseVersion(pyarrow.__version__) >= LooseVersion("0.15") -else: - _pyarrow_version_ge_015 = False +_pyarrow_version_ge_015 = LooseVersion(pyarrow.__version__) >= LooseVersion("0.15") def pyarrow_array_to_numpy_and_mask(arr, dtype): @@ -44,3 +36,89 @@ def pyarrow_array_to_numpy_and_mask(arr, dtype): else: mask = np.ones(len(arr), dtype=bool) return data, mask + + +if _pyarrow_version_ge_015: + # the pyarrow extension types are only available for pyarrow 0.15+ + + class ArrowPeriodType(pyarrow.ExtensionType): + def __init__(self, freq): + # attributes need to be set 
first before calling + # super init (as that calls serialize) + self._freq = freq + pyarrow.ExtensionType.__init__(self, pyarrow.int64(), "pandas.period") + + @property + def freq(self): + return self._freq + + def __arrow_ext_serialize__(self): + metadata = {"freq": self.freq} + return json.dumps(metadata).encode() + + @classmethod + def __arrow_ext_deserialize__(cls, storage_type, serialized): + metadata = json.loads(serialized.decode()) + return ArrowPeriodType(metadata["freq"]) + + def __eq__(self, other): + if isinstance(other, pyarrow.BaseExtensionType): + return type(self) == type(other) and self.freq == other.freq + else: + return NotImplemented + + def __hash__(self): + return hash((str(self), self.freq)) + + # register the type with a dummy instance + _period_type = ArrowPeriodType("D") + pyarrow.register_extension_type(_period_type) + + class ArrowIntervalType(pyarrow.ExtensionType): + def __init__(self, subtype, closed): + # attributes need to be set first before calling + # super init (as that calls serialize) + assert closed in _VALID_CLOSED + self._closed = closed + if not isinstance(subtype, pyarrow.DataType): + subtype = pyarrow.type_for_alias(str(subtype)) + self._subtype = subtype + + storage_type = pyarrow.struct([("left", subtype), ("right", subtype)]) + pyarrow.ExtensionType.__init__(self, storage_type, "pandas.interval") + + @property + def subtype(self): + return self._subtype + + @property + def closed(self): + return self._closed + + def __arrow_ext_serialize__(self): + metadata = {"subtype": str(self.subtype), "closed": self.closed} + return json.dumps(metadata).encode() + + @classmethod + def __arrow_ext_deserialize__(cls, storage_type, serialized): + metadata = json.loads(serialized.decode()) + subtype = pyarrow.type_for_alias(metadata["subtype"]) + closed = metadata["closed"] + return ArrowIntervalType(subtype, closed) + + def __eq__(self, other): + if isinstance(other, pyarrow.BaseExtensionType): + return ( + type(self) == type(other) 
+ and self.subtype == other.subtype + and self.closed == other.closed + ) + else: + return NotImplemented + + def __hash__(self): + return hash((str(self), str(self.subtype), self.closed)) + + # register the type with a dummy instance + _interval_type = ArrowIntervalType(pyarrow.int64(), "left") + pyarrow.register_extension_type(_interval_type) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 0ef72c6e80200..5b541ee561688 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -28,7 +28,6 @@ from pandas.core import nanops, ops from pandas.core.algorithms import take from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin -from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask import pandas.core.common as com from pandas.core.indexers import check_bool_array_indexer from pandas.core.ops import invalid_comparison @@ -95,6 +94,7 @@ def construct_array_type(cls): def __from_arrow__(self, array): """Construct IntegerArray from passed pyarrow Array/ChunkedArray""" import pyarrow + from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask if isinstance(array, pyarrow.Array): chunks = [array] diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index edd9d5bc2761b..e6cfca74048e7 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -1,4 +1,3 @@ -import json from operator import le, lt import textwrap @@ -37,7 +36,6 @@ from pandas.core.dtypes.missing import isna, notna from pandas.core.algorithms import take, value_counts -from pandas.core.arrays._arrow_utils import _PYARROW_INSTALLED, _pyarrow_version_ge_015 from pandas.core.arrays.base import ExtensionArray, _extension_array_shared_docs from pandas.core.arrays.categorical import Categorical import pandas.core.common as com @@ -1088,6 +1086,7 @@ def __arrow_array__(self, type=None): Convert myself into a pyarrow Array. 
""" import pyarrow + from pandas.core.arrays._arrow_utils import ArrowIntervalType try: subtype = pyarrow.from_numpy_dtype(self.dtype.subtype) @@ -1326,56 +1325,3 @@ def maybe_convert_platform_interval(values): values = np.asarray(values) return maybe_convert_platform(values) - - -if _PYARROW_INSTALLED and _pyarrow_version_ge_015: - import pyarrow - - class ArrowIntervalType(pyarrow.ExtensionType): - def __init__(self, subtype, closed): - # attributes need to be set first before calling - # super init (as that calls serialize) - assert closed in _VALID_CLOSED - self._closed = closed - if not isinstance(subtype, pyarrow.DataType): - subtype = pyarrow.type_for_alias(str(subtype)) - self._subtype = subtype - - storage_type = pyarrow.struct([("left", subtype), ("right", subtype)]) - pyarrow.ExtensionType.__init__(self, storage_type, "pandas.interval") - - @property - def subtype(self): - return self._subtype - - @property - def closed(self): - return self._closed - - def __arrow_ext_serialize__(self): - metadata = {"subtype": str(self.subtype), "closed": self.closed} - return json.dumps(metadata).encode() - - @classmethod - def __arrow_ext_deserialize__(cls, storage_type, serialized): - metadata = json.loads(serialized.decode()) - subtype = pyarrow.type_for_alias(metadata["subtype"]) - closed = metadata["closed"] - return ArrowIntervalType(subtype, closed) - - def __eq__(self, other): - if isinstance(other, pyarrow.BaseExtensionType): - return ( - type(self) == type(other) - and self.subtype == other.subtype - and self.closed == other.closed - ) - else: - return NotImplemented - - def __hash__(self): - return hash((str(self), str(self.subtype), self.closed)) - - # register the type with a dummy instance - _interval_type = ArrowIntervalType(pyarrow.int64(), "left") - pyarrow.register_extension_type(_interval_type) diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index cf0b4dd621822..b5fa3206055cd 100644 --- a/pandas/core/arrays/period.py +++ 
b/pandas/core/arrays/period.py @@ -1,5 +1,4 @@ from datetime import timedelta -import json import operator from typing import Any, Callable, List, Optional, Sequence, Union @@ -42,7 +41,6 @@ import pandas.core.algorithms as algos from pandas.core.arrays import datetimelike as dtl -from pandas.core.arrays._arrow_utils import _PYARROW_INSTALLED, _pyarrow_version_ge_015 import pandas.core.common as com from pandas.tseries import frequencies @@ -290,6 +288,7 @@ def __arrow_array__(self, type=None): Convert myself into a pyarrow Array. """ import pyarrow + from pandas.core.arrays._arrow_utils import ArrowPeriodType if type is not None: if pyarrow.types.is_integer(type): @@ -1053,40 +1052,3 @@ def _make_field_arrays(*fields): ] return arrays - - -if _PYARROW_INSTALLED and _pyarrow_version_ge_015: - import pyarrow - - class ArrowPeriodType(pyarrow.ExtensionType): - def __init__(self, freq): - # attributes need to be set first before calling - # super init (as that calls serialize) - self._freq = freq - pyarrow.ExtensionType.__init__(self, pyarrow.int64(), "pandas.period") - - @property - def freq(self): - return self._freq - - def __arrow_ext_serialize__(self): - metadata = {"freq": self.freq} - return json.dumps(metadata).encode() - - @classmethod - def __arrow_ext_deserialize__(cls, storage_type, serialized): - metadata = json.loads(serialized.decode()) - return ArrowPeriodType(metadata["freq"]) - - def __eq__(self, other): - if isinstance(other, pyarrow.BaseExtensionType): - return type(self) == type(other) and self.freq == other.freq - else: - return NotImplemented - - def __hash__(self): - return hash((str(self), self.freq)) - - # register the type with a dummy instance - _period_type = ArrowPeriodType("D") - pyarrow.register_extension_type(_period_type) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index f68347f042086..3a686a1a3b122 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -76,6 +76,9 @@ def __init__(self): ) import pyarrow.parquet 
+ # import utils to register the pyarrow extension types + import pandas.core.arrays._arrow_utils # noqa + self.api = pyarrow def write( diff --git a/pandas/tests/arrays/interval/test_interval.py b/pandas/tests/arrays/interval/test_interval.py index 9f28972140137..e046d87780bb4 100644 --- a/pandas/tests/arrays/interval/test_interval.py +++ b/pandas/tests/arrays/interval/test_interval.py @@ -117,7 +117,7 @@ def test_repr(): @pyarrow_skip def test_arrow_extension_type(): import pyarrow as pa - from pandas.core.arrays.interval import ArrowIntervalType + from pandas.core.arrays._arrow_utils import ArrowIntervalType p1 = ArrowIntervalType(pa.int64(), "left") p2 = ArrowIntervalType(pa.int64(), "left") @@ -133,7 +133,7 @@ def test_arrow_extension_type(): @pyarrow_skip def test_arrow_array(): import pyarrow as pa - from pandas.core.arrays.interval import ArrowIntervalType + from pandas.core.arrays._arrow_utils import ArrowIntervalType intervals = pd.interval_range(1, 5, freq=1).array @@ -162,7 +162,7 @@ def test_arrow_array(): @pyarrow_skip def test_arrow_array_missing(): import pyarrow as pa - from pandas.core.arrays.interval import ArrowIntervalType + from pandas.core.arrays._arrow_utils import ArrowIntervalType arr = IntervalArray.from_breaks([0, 1, 2, 3]) arr[1] = None @@ -196,7 +196,7 @@ def test_arrow_array_missing(): ) def test_arrow_table_roundtrip(breaks): import pyarrow as pa - from pandas.core.arrays.interval import ArrowIntervalType + from pandas.core.arrays._arrow_utils import ArrowIntervalType arr = IntervalArray.from_breaks(breaks) arr[1] = None diff --git a/pandas/tests/arrays/test_period.py b/pandas/tests/arrays/test_period.py index 10099ebd5228a..1f4351c7e20ee 100644 --- a/pandas/tests/arrays/test_period.py +++ b/pandas/tests/arrays/test_period.py @@ -334,7 +334,7 @@ def test_min_max_empty(self, skipna): @pyarrow_skip def test_arrow_extension_type(): - from pandas.core.arrays.period import ArrowPeriodType + from pandas.core.arrays._arrow_utils import 
ArrowPeriodType p1 = ArrowPeriodType("D") p2 = ArrowPeriodType("D") @@ -357,7 +357,7 @@ def test_arrow_extension_type(): ) def test_arrow_array(data, freq): import pyarrow as pa - from pandas.core.arrays.period import ArrowPeriodType + from pandas.core.arrays._arrow_utils import ArrowPeriodType periods = period_array(data, freq=freq) result = pa.array(periods) @@ -381,7 +381,7 @@ def test_arrow_array(data, freq): @pyarrow_skip def test_arrow_array_missing(): import pyarrow as pa - from pandas.core.arrays.period import ArrowPeriodType + from pandas.core.arrays._arrow_utils import ArrowPeriodType arr = PeriodArray([1, 2, 3], freq="D") arr[1] = pd.NaT @@ -396,7 +396,7 @@ def test_arrow_array_missing(): @pyarrow_skip def test_arrow_table_roundtrip(): import pyarrow as pa - from pandas.core.arrays.period import ArrowPeriodType + from pandas.core.arrays._arrow_utils import ArrowPeriodType arr = PeriodArray([1, 2, 3], freq="D") arr[1] = pd.NaT From e3037494cf87272011d8ba78d8220029d1a29ad8 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 8 Jan 2020 13:47:49 +0100 Subject: [PATCH 15/15] update whatsnew for pyarrow next version --- doc/source/user_guide/io.rst | 4 ++-- doc/source/whatsnew/v1.0.0.rst | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 08dfc7f8990a8..fcf68522bd338 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -4649,9 +4649,9 @@ Several caveats. * In the ``pyarrow`` engine, categorical dtypes for non-string types can be serialized to parquet, but will de-serialize as their primitive dtype. * The ``pyarrow`` engine preserves the ``ordered`` flag of categorical dtypes with string types. ``fastparquet`` does not preserve the ``ordered`` flag. * Non supported types include ``Interval`` and actual Python object types. These will raise a helpful error message - on an attempt at serialization. 
``Period`` type is supported with pyarrow >= 1.0.0. + on an attempt at serialization. ``Period`` type is supported with pyarrow >= 0.16.0. * The ``pyarrow`` engine preserves extension data types such as the nullable integer and string data - type (requiring pyarrow >= 1.0.0, and requiring the extension type to implement the needed protocols, + type (requiring pyarrow >= 0.16.0, and requiring the extension type to implement the needed protocols, see the :ref:`extension types documentation <extending.extension.arrow>`). You can specify an ``engine`` to direct the serialization. This can be one of ``pyarrow``, or ``fastparquet``, or ``auto``. diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index ef7b408c2d8b4..8024284acc33d 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -206,7 +206,7 @@ Other enhancements - :meth:`Styler.format` added the ``na_rep`` parameter to help format the missing values (:issue:`21527`, :issue:`28358`) - Roundtripping DataFrames with nullable integer, string and period data types to parquet (:meth:`~DataFrame.to_parquet` / :func:`read_parquet`) using the `'pyarrow'` engine - now preserve those data types with pyarrow >= 1.0.0 (:issue:`20612`, :issue:`28371`). + now preserve those data types with pyarrow >= 0.16.0 (:issue:`20612`, :issue:`28371`). - The ``partition_cols`` argument in :meth:`DataFrame.to_parquet` now accepts a string (:issue:`27117`) - :func:`pandas.read_json` now parses ``NaN``, ``Infinity`` and ``-Infinity`` (:issue:`12213`) - The ``pandas.np`` submodule is now deprecated. Import numpy directly instead (:issue:`30296`)