Skip to content

ENH: support mask for missing values when writing data / support writing pandas nullable dtypes #232

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Apr 2, 2023
2 changes: 2 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
- Add "driver" property to `read_info` result (#224)
- Add support for dataset open options to `read`, `read_dataframe`, and
`read_info` (#233)
- Add support for pandas' nullable data types in `write_dataframe`, and for
manually specifying a mask for missing values in `write` (#219)
- Standardized 3-dimensional geometry type labels from "2.5D <type>" to
"<type> Z" for consistency with well-known text (WKT) formats (#234)

Expand Down
17 changes: 15 additions & 2 deletions pyogrio/_io.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1404,7 +1404,7 @@ cdef infer_field_types(list dtypes):

# TODO: set geometry and field data as memory views?
def ogr_write(
str path, str layer, str driver, geometry, field_data, fields,
str path, str layer, str driver, geometry, fields, field_data, field_mask,
str crs, str geometry_type, str encoding, object dataset_kwargs,
object layer_kwargs, bint promote_to_multi=False, bint nan_as_null=True,
bint append=False
Expand Down Expand Up @@ -1442,6 +1442,15 @@ def ogr_write(
if len(field_data[i]) != num_records:
raise ValueError("field_data arrays must be same length as geometry array")

if field_mask is not None:
if len(field_data) != len(field_mask):
raise ValueError("field_data and field_mask must be same length")
for i in range(0, len(field_mask)):
if field_mask[i] is not None and len(field_mask[i]) != num_records:
raise ValueError("field_mask arrays must be same length as geometry array")
else:
field_mask = [None] * len(field_data)

path_b = path.encode('UTF-8')
path_c = path_b

Expand Down Expand Up @@ -1658,7 +1667,11 @@ def ogr_write(
field_value = field_data[field_idx][i]
field_type = field_types[field_idx][0]

if field_type == OFTString:
mask = field_mask[field_idx]
if mask is not None and mask[i]:
OGR_F_SetFieldNull(ogr_feature, field_idx)

elif field_type == OFTString:
# TODO: encode string using approach from _get_internal_encoding which checks layer capabilities
if (
field_value is None
Expand Down
18 changes: 17 additions & 1 deletion pyogrio/geopandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -311,7 +311,22 @@ def write_dataframe(
fields = [c for c in df.columns if not c == geometry_column]

# TODO: may need to fill in pd.NA, etc
field_data = [df[f].values for f in fields]
field_data = []
field_mask = []
for name in fields:
col = df[name].values
if isinstance(col, pd.api.extensions.ExtensionArray):
from pandas.arrays import IntegerArray, FloatingArray, BooleanArray

if isinstance(col, (IntegerArray, FloatingArray, BooleanArray)):
field_data.append(col._data)
field_mask.append(col._mask)
else:
field_data.append(np.asarray(col))
field_mask.append(np.asarray(col.isna()))
else:
field_data.append(col)
field_mask.append(None)

# Determine geometry_type and/or promote_to_multi
if geometry_type is None or promote_to_multi is None:
Expand Down Expand Up @@ -386,6 +401,7 @@ def write_dataframe(
driver=driver,
geometry=to_wkb(geometry.values),
field_data=field_data,
field_mask=field_mask,
fields=fields,
crs=crs,
geometry_type=geometry_type,
Expand Down
2 changes: 2 additions & 0 deletions pyogrio/raw.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,7 @@ def write(
geometry,
field_data,
fields,
field_mask=None,
layer=None,
driver=None,
# derived from meta if roundtrip
Expand Down Expand Up @@ -349,6 +350,7 @@ def write(
geometry=geometry,
geometry_type=geometry_type,
field_data=field_data,
field_mask=field_mask,
fields=fields,
crs=crs,
encoding=encoding,
Expand Down
22 changes: 22 additions & 0 deletions pyogrio/tests/test_geopandas_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -1022,3 +1022,25 @@ def test_read_dataset_kwargs(data_dir, use_arrow):
def test_read_invalid_dataset_kwargs(capfd, naturalearth_lowres, use_arrow):
read_dataframe(naturalearth_lowres, use_arrow=use_arrow, INVALID="YES")
assert "does not support open option INVALID" in capfd.readouterr().err


def test_write_nullable_dtypes(tmp_path):
    """Round-trip a GeoDataFrame holding pandas nullable columns via GPKG.

    One plain numpy ``int64`` column is written alongside nullable
    ``Int64``, ``Float32``, ``boolean`` and ``string`` columns, each of the
    nullable ones containing a missing value.  The frame is written with
    ``write_dataframe``, read back with ``read_dataframe`` and compared to
    the input after casting the nullable columns to numpy dtypes.
    """
    path = tmp_path / "test_nullable_dtypes.gpkg"

    columns = {
        "col1": pd.Series([1, 2, 3], dtype="int64"),
        "col2": pd.Series([1, 2, None], dtype="Int64"),
        "col3": pd.Series([0.1, None, 0.3], dtype="Float32"),
        "col4": pd.Series([True, False, None], dtype="boolean"),
        "col5": pd.Series(["a", None, "b"], dtype="string"),
    }
    input_gdf = gp.GeoDataFrame(
        columns, geometry=[Point(0, 0)] * 3, crs="epsg:31370"
    )

    write_dataframe(input_gdf, path)
    output_gdf = read_dataframe(path)

    # Reading yields the default (non-nullable) numpy dtypes, so the expected
    # frame is the input with every nullable column cast to its numpy
    # counterpart.
    numpy_dtypes = {
        "col2": "float64",
        "col3": "float32",
        "col4": "float64",
        "col5": object,
    }
    expected = input_gdf.copy()
    for column, dtype in numpy_dtypes.items():
        expected[column] = expected[column].astype(dtype)

    assert_geodataframe_equal(output_gdf, expected)
28 changes: 28 additions & 0 deletions pyogrio/tests/test_raw_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -806,3 +806,31 @@ def test_encoding_io_shapefile(tmp_path, read_encoding, write_encoding):
assert np.array_equal(
fields, read_info(filename, encoding=read_encoding)["fields"]
)


def test_write_with_mask(tmp_path):
    """Check that ``write`` honours ``field_mask`` and validates its shape.

    A masked entry must come back as NaN when the file is read again, and a
    mask with the wrong number of records or the wrong number of arrays
    must raise ``ValueError``.
    """
    # WKB encoding of Point(0, 0); the same point is used for all 3 records.
    point_wkb = bytes.fromhex("010100000000000000000000000000000000000000")
    geometry = np.array([point_wkb] * 3, dtype=object)

    field_data = [np.array([1, 2, 3], dtype="int32")]
    fields = ["col"]
    meta = {"geometry_type": "Point", "crs": "EPSG:4326"}
    filename = tmp_path / "test.geojson"

    # Mask out the second record only.
    valid_mask = [np.array([False, True, False])]
    write(filename, geometry, field_data, fields, valid_mask, **meta)

    result = read(filename)
    result_geometry, result_fields = result[2:]
    assert np.array_equal(result_geometry, geometry)
    np.testing.assert_allclose(result_fields[0], np.array([1, np.nan, 3]))

    invalid_masks = (
        # wrong length for mask
        [np.array([False, True])],
        # wrong number of mask arrays
        [np.array([False, True, False])] * 2,
    )
    for bad_mask in invalid_masks:
        with pytest.raises(ValueError):
            write(filename, geometry, field_data, fields, bad_mask, **meta)