Skip to content

Commit 3194b3e

Browse files
xr.cov() and xr.corr() (#4089)
* Added chunks='auto' option in dataset.py * reverted accidental changes in dataset.chunk() * Added corr and cov to computation.py. Taken from r-beer:xarray/corr * Added r-beer's tests to test_computation.py Still issues I think * trying to fix github.com//pull/3550#discussion_r349935731 * Removing drop=True from the `.where()` calls in `computation.py`+test.py * api.rst and whats-new.rst * Updated `xarray/__init__.py` and added `broadcast` import to computation * added DataArray import to corr, cov * assert_allclose added to test_computation.py * removed whitespace in test_dask...oops * Added to init * format changes * Fiddling around with cov/corr tests in `test_computation.py` * PEP8 changes * pep * remove old todo and comments * isort * Added consistency check between corr() and cov(), ensure they give same * added `skipna=False` to `computation.py`. made consistency+autocov tests * formatting * Added numpy-based tests. * format * formatting again * Update doc/whats-new.rst Co-authored-by: keewis <[email protected]> * refactored corr/cov so there is one internal method for calculating both * formatting * updating docstrings and code suggestions from PR * paramterize ddof in tests * removed extraneous test arrays * formatting + adding deterministic docstring * added test for TypeError * formatting * tidying up docstring * formatting and tidying up `_cov_corr()` so that the logic is more clear * flake8 ... Co-authored-by: keewis <[email protected]>
1 parent bdb1d33 commit 3194b3e

File tree

5 files changed

+343
-3
lines changed

5 files changed

+343
-3
lines changed

doc/api.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@ Top-level functions
2929
full_like
3030
zeros_like
3131
ones_like
32+
cov
33+
corr
3234
dot
3335
polyval
3436
map_blocks

doc/whats-new.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@ Breaking changes
3636

3737
New Features
3838
~~~~~~~~~~~~
39+
- Added :py:func:`xarray.cov` and :py:func:`xarray.corr` (:issue:`3784`, :pull:`3550`, :pull:`4089`).
40+
By `Andrew Williams <https://github.com/AndrewWilliams3142>`_ and `Robin Beer <https://github.com/r-beer>`_.
3941
- Added :py:meth:`DataArray.polyfit` and :py:func:`xarray.polyval` for fitting polynomials. (:issue:`3349`)
4042
By `Pascal Bourgault <https://github.com/aulemahal>`_.
4143
- Control over attributes of result in :py:func:`merge`, :py:func:`concat`,

xarray/__init__.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
from .core.alignment import align, broadcast
1818
from .core.combine import auto_combine, combine_by_coords, combine_nested
1919
from .core.common import ALL_DIMS, full_like, ones_like, zeros_like
20-
from .core.computation import apply_ufunc, dot, polyval, where
20+
from .core.computation import apply_ufunc, corr, cov, dot, polyval, where
2121
from .core.concat import concat
2222
from .core.dataarray import DataArray
2323
from .core.dataset import Dataset
@@ -54,6 +54,8 @@
5454
"concat",
5555
"decode_cf",
5656
"dot",
57+
"cov",
58+
"corr",
5759
"full_like",
5860
"load_dataarray",
5961
"load_dataset",

xarray/core/computation.py

Lines changed: 179 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
import numpy as np
2525

2626
from . import dtypes, duck_array_ops, utils
27-
from .alignment import deep_align
27+
from .alignment import align, deep_align
2828
from .merge import merge_coordinates_without_align
2929
from .options import OPTIONS
3030
from .pycompat import dask_array_type
@@ -1069,6 +1069,184 @@ def earth_mover_distance(first_samples,
10691069
return apply_array_ufunc(func, *args, dask=dask)
10701070

10711071

1072+
def cov(da_a, da_b, dim=None, ddof=1):
    """
    Compute covariance between two DataArray objects along a shared dimension.

    Parameters
    ----------
    da_a: DataArray object
        Array to compute.
    da_b: DataArray object
        Array to compute.
    dim : str, optional
        The dimension along which the covariance will be computed. When
        omitted, the reduction is taken over all dimensions (see the first
        ``xr.cov`` call in the examples, which yields a 0-d result).
    ddof: int, optional
        Delta degrees of freedom. The sum of products is divided by
        ``N - ddof``, where ``N`` is the number of positions at which both
        arrays are non-null. The default ``ddof=1`` gives the unbiased
        estimate; ``ddof=0`` normalizes by ``N``.

    Returns
    -------
    covariance: DataArray

    Raises
    ------
    TypeError
        If either ``da_a`` or ``da_b`` is not a DataArray.

    See also
    --------
    pandas.Series.cov: corresponding pandas function
    xr.corr: respective function to calculate correlation

    Examples
    --------
    >>> da_a = DataArray(np.array([[1, 2, 3], [0.1, 0.2, 0.3], [3.2, 0.6, 1.8]]),
    ...                  dims=("space", "time"),
    ...                  coords=[('space', ['IA', 'IL', 'IN']),
    ...                          ('time', pd.date_range("2000-01-01", freq="1D", periods=3))])
    >>> da_a
    <xarray.DataArray (space: 3, time: 3)>
    array([[1. , 2. , 3. ],
           [0.1, 0.2, 0.3],
           [3.2, 0.6, 1.8]])
    Coordinates:
      * space    (space) <U2 'IA' 'IL' 'IN'
      * time     (time) datetime64[ns] 2000-01-01 2000-01-02 2000-01-03
    >>> da_b = DataArray(np.array([[0.2, 0.4, 0.6], [15, 10, 5], [3.2, 0.6, 1.8]]),
    ...                  dims=("space", "time"),
    ...                  coords=[('space', ['IA', 'IL', 'IN']),
    ...                          ('time', pd.date_range("2000-01-01", freq="1D", periods=3))])
    >>> da_b
    <xarray.DataArray (space: 3, time: 3)>
    array([[ 0.2,  0.4,  0.6],
           [15. , 10. ,  5. ],
           [ 3.2,  0.6,  1.8]])
    Coordinates:
      * space    (space) <U2 'IA' 'IL' 'IN'
      * time     (time) datetime64[ns] 2000-01-01 2000-01-02 2000-01-03
    >>> xr.cov(da_a, da_b)
    <xarray.DataArray ()>
    array(-3.53055556)
    >>> xr.cov(da_a, da_b, dim='time')
    <xarray.DataArray (space: 3)>
    array([ 0.2, -0.5, 1.69333333])
    Coordinates:
      * space    (space) <U2 'IA' 'IL' 'IN'
    """
    # Imported here rather than at module level; presumably to avoid a
    # circular import between computation.py and dataarray.py.
    from .dataarray import DataArray

    if any(not isinstance(arr, DataArray) for arr in [da_a, da_b]):
        raise TypeError(
            "Only xr.DataArray is supported. "
            "Given {}.".format([type(arr) for arr in [da_a, da_b]])
        )

    return _cov_corr(da_a, da_b, dim=dim, ddof=ddof, method="cov")
1141+
1142+
1143+
def corr(da_a, da_b, dim=None):
    """
    Compute the Pearson correlation coefficient between
    two DataArray objects along a shared dimension.

    Parameters
    ----------
    da_a: DataArray object
        Array to compute.
    da_b: DataArray object
        Array to compute.
    dim: str, optional
        The dimension along which the correlation will be computed. When
        omitted, the reduction is taken over all dimensions (see the first
        ``xr.corr`` call in the examples, which yields a 0-d result).

    Returns
    -------
    correlation: DataArray

    Raises
    ------
    TypeError
        If either ``da_a`` or ``da_b`` is not a DataArray.

    See also
    --------
    pandas.Series.corr: corresponding pandas function
    xr.cov: underlying covariance function

    Examples
    --------
    >>> da_a = DataArray(np.array([[1, 2, 3], [0.1, 0.2, 0.3], [3.2, 0.6, 1.8]]),
    ...                  dims=("space", "time"),
    ...                  coords=[('space', ['IA', 'IL', 'IN']),
    ...                          ('time', pd.date_range("2000-01-01", freq="1D", periods=3))])
    >>> da_a
    <xarray.DataArray (space: 3, time: 3)>
    array([[1. , 2. , 3. ],
           [0.1, 0.2, 0.3],
           [3.2, 0.6, 1.8]])
    Coordinates:
      * space    (space) <U2 'IA' 'IL' 'IN'
      * time     (time) datetime64[ns] 2000-01-01 2000-01-02 2000-01-03
    >>> da_b = DataArray(np.array([[0.2, 0.4, 0.6], [15, 10, 5], [3.2, 0.6, 1.8]]),
    ...                  dims=("space", "time"),
    ...                  coords=[('space', ['IA', 'IL', 'IN']),
    ...                          ('time', pd.date_range("2000-01-01", freq="1D", periods=3))])
    >>> da_b
    <xarray.DataArray (space: 3, time: 3)>
    array([[ 0.2,  0.4,  0.6],
           [15. , 10. ,  5. ],
           [ 3.2,  0.6,  1.8]])
    Coordinates:
      * space    (space) <U2 'IA' 'IL' 'IN'
      * time     (time) datetime64[ns] 2000-01-01 2000-01-02 2000-01-03
    >>> xr.corr(da_a, da_b)
    <xarray.DataArray ()>
    array(-0.57087777)
    >>> xr.corr(da_a, da_b, dim='time')
    <xarray.DataArray (space: 3)>
    array([ 1., -1.,  1.])
    Coordinates:
      * space    (space) <U2 'IA' 'IL' 'IN'
    """
    # Imported here rather than at module level; presumably to avoid a
    # circular import between computation.py and dataarray.py.
    from .dataarray import DataArray

    if any(not isinstance(arr, DataArray) for arr in [da_a, da_b]):
        raise TypeError(
            "Only xr.DataArray is supported. "
            "Given {}.".format([type(arr) for arr in [da_a, da_b]])
        )

    # ddof is left at the helper's default (0) so the covariance uses the
    # same normalization as the default ``.std()`` calls it is divided by.
    return _cov_corr(da_a, da_b, dim=dim, method="corr")
1210+
1211+
1212+
def _cov_corr(da_a, da_b, dim=None, ddof=0, method=None):
    """
    Shared implementation behind xr.cov() and xr.corr().

    Both public functions delegate here so that input sanitization and the
    covariance arithmetic live in exactly one place. ``method="cov"``
    returns the covariance; any other value returns the correlation.
    """
    # Step 1: align the two inputs, keeping only the labels common to both.
    da_a, da_b = align(da_a, da_b, join="inner", copy=False)

    # Step 2: a position only counts when it is non-null in *both* arrays.
    both_present = da_a.notnull() & da_b.notnull()
    if not both_present.all():
        # Mask each array wherever its partner is missing.
        da_a = da_a.where(both_present)
        da_b = da_b.where(both_present)

    # Divisor: number of jointly valid samples, reduced by ddof.
    n_effective = both_present.sum(dim) - ddof

    # Step 3: subtract the mean along `dim` from each array.
    anomalies_a = da_a - da_a.mean(dim=dim)
    anomalies_b = da_b - da_b.mean(dim=dim)

    # Step 4: covariance along `dim`.
    # N.B. `skipna=False` is required or there is a bug when computing
    # auto-covariance. E.g. Try xr.cov(da,da) for
    # da = xr.DataArray([[1, 2], [1, np.nan]], dims=["x", "time"])
    product_sum = (anomalies_a * anomalies_b).sum(dim=dim, skipna=False)
    covariance = product_sum / (n_effective)

    if method == "cov":
        return covariance

    # Correlation: scale the covariance by both standard deviations.
    std_a = da_a.std(dim=dim)
    std_b = da_b.std(dim=dim)
    return covariance / (std_a * std_b)
1248+
1249+
10721250
def dot(*arrays, dims=None, **kwargs):
10731251
"""Generalized dot product for xarray objects. Like np.einsum, but
10741252
provides a simpler interface based on array dimensions.

0 commit comments

Comments
 (0)