|
24 | 24 | import numpy as np
|
25 | 25 |
|
26 | 26 | from . import dtypes, duck_array_ops, utils
|
27 |
| -from .alignment import deep_align |
| 27 | +from .alignment import align, deep_align |
28 | 28 | from .merge import merge_coordinates_without_align
|
29 | 29 | from .options import OPTIONS
|
30 | 30 | from .pycompat import dask_array_type
|
@@ -1069,6 +1069,184 @@ def earth_mover_distance(first_samples,
|
1069 | 1069 | return apply_array_ufunc(func, *args, dask=dask)
|
1070 | 1070 |
|
1071 | 1071 |
|
def cov(da_a, da_b, dim=None, ddof=1):
    """
    Compute covariance between two DataArray objects along a shared dimension.

    Parameters
    ----------
    da_a: DataArray object
        Array to compute.
    da_b: DataArray object
        Array to compute.
    dim : str, optional
        The dimension along which the covariance will be computed. If not
        given, the covariance is computed over all dimensions (see Examples).
    ddof: int, optional
        Delta degrees of freedom: the covariance is normalized by ``N - ddof``,
        where ``N`` is the number of valid observations. The default
        ``ddof=1`` normalizes by ``N - 1``, giving an unbiased estimate;
        ``ddof=0`` normalizes by ``N``.

    Returns
    -------
    covariance: DataArray

    Raises
    ------
    TypeError
        If either ``da_a`` or ``da_b`` is not a DataArray.

    See also
    --------
    pandas.Series.cov: corresponding pandas function
    xr.corr: respective function to calculate correlation

    Examples
    --------
    >>> da_a = DataArray(np.array([[1, 2, 3], [0.1, 0.2, 0.3], [3.2, 0.6, 1.8]]),
    ...                  dims=("space", "time"),
    ...                  coords=[('space', ['IA', 'IL', 'IN']),
    ...                          ('time', pd.date_range("2000-01-01", freq="1D", periods=3))])
    >>> da_a
    <xarray.DataArray (space: 3, time: 3)>
    array([[1. , 2. , 3. ],
           [0.1, 0.2, 0.3],
           [3.2, 0.6, 1.8]])
    Coordinates:
      * space    (space) <U2 'IA' 'IL' 'IN'
      * time     (time) datetime64[ns] 2000-01-01 2000-01-02 2000-01-03
    >>> da_b = DataArray(np.array([[0.2, 0.4, 0.6], [15, 10, 5], [3.2, 0.6, 1.8]]),
    ...                  dims=("space", "time"),
    ...                  coords=[('space', ['IA', 'IL', 'IN']),
    ...                          ('time', pd.date_range("2000-01-01", freq="1D", periods=3))])
    >>> da_b
    <xarray.DataArray (space: 3, time: 3)>
    array([[ 0.2,  0.4,  0.6],
           [15. , 10. ,  5. ],
           [ 3.2,  0.6,  1.8]])
    Coordinates:
      * space    (space) <U2 'IA' 'IL' 'IN'
      * time     (time) datetime64[ns] 2000-01-01 2000-01-02 2000-01-03
    >>> xr.cov(da_a, da_b)
    <xarray.DataArray ()>
    array(-3.53055556)
    >>> xr.cov(da_a, da_b, dim='time')
    <xarray.DataArray (space: 3)>
    array([ 0.2, -0.5, 1.69333333])
    Coordinates:
      * space    (space) <U2 'IA' 'IL' 'IN'
    """
    # Imported here rather than at module level, as in the original code
    # (presumably to avoid a circular import with .dataarray).
    from .dataarray import DataArray

    if any(not isinstance(arr, DataArray) for arr in [da_a, da_b]):
        raise TypeError(
            "Only xr.DataArray is supported. "
            "Given {}.".format([type(arr) for arr in [da_a, da_b]])
        )

    return _cov_corr(da_a, da_b, dim=dim, ddof=ddof, method="cov")
| 1141 | + |
| 1142 | + |
def corr(da_a, da_b, dim=None):
    """
    Compute the Pearson correlation coefficient between
    two DataArray objects along a shared dimension.

    Parameters
    ----------
    da_a: DataArray object
        Array to compute.
    da_b: DataArray object
        Array to compute.
    dim: str, optional
        The dimension along which the correlation will be computed. If not
        given, the correlation is computed over all dimensions (see Examples).

    Returns
    -------
    correlation: DataArray

    Raises
    ------
    TypeError
        If either ``da_a`` or ``da_b`` is not a DataArray.

    See also
    --------
    pandas.Series.corr: corresponding pandas function
    xr.cov: underlying covariance function

    Examples
    --------
    >>> da_a = DataArray(np.array([[1, 2, 3], [0.1, 0.2, 0.3], [3.2, 0.6, 1.8]]),
    ...                  dims=("space", "time"),
    ...                  coords=[('space', ['IA', 'IL', 'IN']),
    ...                          ('time', pd.date_range("2000-01-01", freq="1D", periods=3))])
    >>> da_a
    <xarray.DataArray (space: 3, time: 3)>
    array([[1. , 2. , 3. ],
           [0.1, 0.2, 0.3],
           [3.2, 0.6, 1.8]])
    Coordinates:
      * space    (space) <U2 'IA' 'IL' 'IN'
      * time     (time) datetime64[ns] 2000-01-01 2000-01-02 2000-01-03
    >>> da_b = DataArray(np.array([[0.2, 0.4, 0.6], [15, 10, 5], [3.2, 0.6, 1.8]]),
    ...                  dims=("space", "time"),
    ...                  coords=[('space', ['IA', 'IL', 'IN']),
    ...                          ('time', pd.date_range("2000-01-01", freq="1D", periods=3))])
    >>> da_b
    <xarray.DataArray (space: 3, time: 3)>
    array([[ 0.2,  0.4,  0.6],
           [15. , 10. ,  5. ],
           [ 3.2,  0.6,  1.8]])
    Coordinates:
      * space    (space) <U2 'IA' 'IL' 'IN'
      * time     (time) datetime64[ns] 2000-01-01 2000-01-02 2000-01-03
    >>> xr.corr(da_a, da_b)
    <xarray.DataArray ()>
    array(-0.57087777)
    >>> xr.corr(da_a, da_b, dim='time')
    <xarray.DataArray (space: 3)>
    array([ 1., -1.,  1.])
    Coordinates:
      * space    (space) <U2 'IA' 'IL' 'IN'
    """
    # Imported here rather than at module level, as in the original code
    # (presumably to avoid a circular import with .dataarray).
    from .dataarray import DataArray

    if any(not isinstance(arr, DataArray) for arr in [da_a, da_b]):
        raise TypeError(
            "Only xr.DataArray is supported. "
            "Given {}.".format([type(arr) for arr in [da_a, da_b]])
        )

    # No ddof is passed, so the helper's default (ddof=0) is used.
    return _cov_corr(da_a, da_b, dim=dim, method="corr")
| 1210 | + |
| 1211 | + |
def _cov_corr(da_a, da_b, dim=None, ddof=0, method=None):
    """
    Internal method for xr.cov() and xr.corr() so only have to
    sanitize the input arrays once and we don't repeat code.

    Parameters
    ----------
    da_a, da_b : DataArray
        Input arrays; they are aligned on their shared labels first.
    dim : str, optional
        Dimension to reduce over; passed straight to the reductions, so
        ``None`` reduces over all dimensions.
    ddof : int, optional
        Subtracted from the number of valid pairs to form the divisor.
    method : str, optional
        ``"cov"`` returns the covariance; any other value returns the
        Pearson correlation.

    Returns
    -------
    DataArray
        Covariance or correlation. Positions with no valid pair of
        observations are NaN.
    """
    # 1. Align the two arrays on their common labels (inner join).
    #    Broadcasting happens automatically in the arithmetic below.
    da_a, da_b = align(da_a, da_b, join="inner", copy=False)

    # 2. Ignore the nans: keep an observation only where both arrays have one.
    #    The mask is applied unconditionally; testing `valid_values.all()`
    #    first would force eager evaluation of lazily computed arrays.
    valid_values = da_a.notnull() & da_b.notnull()
    da_a = da_a.where(valid_values)
    da_b = da_b.where(valid_values)

    n_valid = valid_values.sum(dim)
    valid_count = n_valid - ddof

    # 3. Detrend along the given dim
    demeaned_da_a = da_a - da_a.mean(dim=dim)
    demeaned_da_b = da_b - da_b.mean(dim=dim)

    # 4. Compute covariance along the given dim
    # N.B. the masked positions are NaN, so the sum must skip them
    # (`skipna=True`); otherwise any slice containing a missing value becomes
    # NaN. E.g. Try xr.cov(da, da) for
    # da = xr.DataArray([[1, 2], [1, np.nan]], dims=["x", "time"])
    cov = (demeaned_da_a * demeaned_da_b).sum(dim=dim, skipna=True) / valid_count

    # A slice with no valid pairs sums to 0 when NaNs are skipped, which
    # would yield a spurious 0 / (-ddof); report it as missing instead.
    cov = cov.where(n_valid > 0)

    if method == "cov":
        return cov

    # compute std + corr (std is called without ddof, matching the ddof=0
    # that corr() uses for the covariance above)
    da_a_std = da_a.std(dim=dim)
    da_b_std = da_b.std(dim=dim)
    return cov / (da_a_std * da_b_std)
| 1248 | + |
| 1249 | + |
1072 | 1250 | def dot(*arrays, dims=None, **kwargs):
|
1073 | 1251 | """Generalized dot product for xarray objects. Like np.einsum, but
|
1074 | 1252 | provides a simpler interface based on array dimensions.
|
|
0 commit comments