Skip to content

Commit c0c46b4

Browse files
Add map_variables and -99999 nan value to read_crn (#1368)
* Add map_variables argument to read_crn * Add test coverage for map_variables * Update whatsnew * Remove unnecessary tz_localize * Replace nans with .replace instead of .where * Extend documentation * Simply test coverage of map_variables * Use assert_index_equal instead of assert Co-authored-by: Kevin Anderson <[email protected]> * Add import of assert_index_equal * Add -99999 and -999999 to nan values * Add -99999 nan bug to whatsnew * Add -99999 to test file * Update doc and whatsnew * Remove -999999 from list of nans * Minor doc update * Add dictionary of nan values * Change CRN_VARIABLE_MAP back to VARIABLE_MAP * Remove numpy import * Reformat setting of dtypes Co-authored-by: Kevin Anderson <[email protected]>
1 parent 93e8404 commit c0c46b4

File tree

4 files changed

+73
-47
lines changed

4 files changed

+73
-47
lines changed

docs/sphinx/source/whatsnew/v0.9.1.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ Deprecations
1616

1717
Enhancements
1818
~~~~~~~~~~~~
19+
* Added ``map_variables`` option to :func:`~pvlib.iotools.read_crn` (:pull:`1368`)
1920
* Added :py:func:`pvlib.temperature.prilliman` for modeling cell temperature
2021
at short time steps (:issue:`1081`, :pull:`1391`)
2122

@@ -29,6 +30,8 @@ Bug fixes
2930
argument was not being passed to the ``optimalinclination`` request parameter (:pull:`1356`)
3031
* Fixed bug in :py:func:`pvlib.bifacial.pvfactors_timeseries` where scalar ``surface_tilt``
3132
and ``surface_azimuth`` inputs caused an error (:issue:`1127`, :issue:`1332`, :pull:`1361`)
33+
* Added -99999 to list of values to map to nan in :func:`~pvlib.iotools.read_crn`
34+
(:issue:`1372`, :pull:`1368`)
3235
* Changed the metadata entry for the wind speed unit to "Wind Speed Units" in
3336
the PSM3 iotools function (:pull:`1375`)
3437
* Improved convergence when determining the maximum power point using

pvlib/data/CRN_with_problems.txt

0 Bytes
Binary file not shown.

pvlib/iotools/crn.py

Lines changed: 46 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -2,15 +2,14 @@
22
"""
33

44
import pandas as pd
5-
import numpy as np
65

76

8-
HEADERS = (
9-
'WBANNO UTC_DATE UTC_TIME LST_DATE LST_TIME CRX_VN LONGITUDE LATITUDE '
10-
'AIR_TEMPERATURE PRECIPITATION SOLAR_RADIATION SR_FLAG '
11-
'SURFACE_TEMPERATURE ST_TYPE ST_FLAG RELATIVE_HUMIDITY RH_FLAG '
12-
'SOIL_MOISTURE_5 SOIL_TEMPERATURE_5 WETNESS WET_FLAG WIND_1_5 WIND_FLAG'
13-
)
7+
HEADERS = [
8+
'WBANNO', 'UTC_DATE', 'UTC_TIME', 'LST_DATE', 'LST_TIME', 'CRX_VN',
9+
'LONGITUDE', 'LATITUDE', 'AIR_TEMPERATURE', 'PRECIPITATION',
10+
'SOLAR_RADIATION', 'SR_FLAG', 'SURFACE_TEMPERATURE', 'ST_TYPE', 'ST_FLAG',
11+
'RELATIVE_HUMIDITY', 'RH_FLAG', 'SOIL_MOISTURE_5', 'SOIL_TEMPERATURE_5',
12+
'WETNESS', 'WET_FLAG', 'WIND_1_5', 'WIND_FLAG']
1413

1514
VARIABLE_MAP = {
1615
'LONGITUDE': 'longitude',
@@ -24,6 +23,21 @@
2423
'WIND_FLAG': 'wind_speed_flag'
2524
}
2625

26+
NAN_DICT = {
27+
'CRX_VN': -99999,
28+
'AIR_TEMPERATURE': -9999,
29+
'PRECIPITATION': -9999,
30+
'SOLAR_RADIATION': -99999,
31+
'SURFACE_TEMPERATURE': -9999,
32+
'RELATIVE_HUMIDITY': -9999,
33+
'SOIL_MOISTURE_5': -99,
34+
'SOIL_TEMPERATURE_5': -9999,
35+
'WETNESS': -9999,
36+
'WIND_1_5': -99}
37+
38+
# Also treat a run of six NUL characters as NaN in each NAN_DICT column
39+
NAN_DICT = {k: [v, '\x00\x00\x00\x00\x00\x00'] for k, v in NAN_DICT.items()}
40+
2741
# as specified in CRN README.txt file. excludes 1 space between columns
2842
WIDTHS = [5, 8, 4, 8, 4, 6, 7, 7, 7, 7, 6, 1, 7, 1, 1, 5, 1, 7, 7, 5, 1, 6, 1]
2943
# add 1 to make fields contiguous (required by pandas.read_fwf)
@@ -40,15 +54,22 @@
4054
]
4155

4256

43-
def read_crn(filename):
44-
"""
45-
Read a NOAA USCRN fixed-width file into pandas dataframe. The CRN is
46-
described in [1]_ and [2]_.
57+
def read_crn(filename, map_variables=True):
58+
"""Read a NOAA USCRN fixed-width file into a pandas dataframe.
59+
60+
The CRN consists of over 100 meteorological stations covering the
61+
U.S. and is described in [1]_ and [2]_. The primary goal of CRN is to
62+
provide long-term measurements of temperature, precipitation, and soil
63+
moisture and temperature. Additionally, global horizontal irradiance (GHI)
64+
is measured at each site using a photodiode pyranometer.
4765
4866
Parameters
4967
----------
5068
filename: str, path object, or file-like
5169
filepath or url to read for the fixed-width file.
70+
map_variables: boolean, default: True
71+
When true, renames columns of the DataFrame to pvlib variable names
72+
where applicable. See variable :const:`VARIABLE_MAP`.
5273
5374
Returns
5475
-------
@@ -60,12 +81,12 @@ def read_crn(filename):
6081
-----
6182
CRN files contain 5 minute averages labeled by the interval ending
6283
time. Here, missing data is flagged as NaN, rather than the lowest
63-
possible integer for a field (e.g. -999 or -99). Air temperature in
64-
deg C. Wind speed in m/s at a height of 1.5 m above ground level.
84+
possible integer for a field (e.g. -9999 or -99). Air temperature is in
85+
deg C and wind speed is in m/s at a height of 1.5 m above ground level.
6586
66-
Variables corresponding to standard pvlib variables are renamed,
87+
Variables corresponding to standard pvlib variables are by default renamed,
6788
e.g. `SOLAR_RADIATION` becomes `ghi`. See the
68-
`pvlib.iotools.crn.VARIABLE_MAP` dict for the complete mapping.
89+
:const:`pvlib.iotools.crn.VARIABLE_MAP` dict for the complete mapping.
6990
7091
CRN files occasionally have a set of null characters on a line
7192
instead of valid data. This function drops those lines. Sometimes
@@ -85,16 +106,13 @@ def read_crn(filename):
85106
Amer. Meteor. Soc., 94, 489-498. :doi:`10.1175/BAMS-D-12-00170.1`
86107
"""
87108

88-
# read in data. set fields with NUL characters to NaN
89-
data = pd.read_fwf(filename, header=None, names=HEADERS.split(' '),
90-
widths=WIDTHS, na_values=['\x00\x00\x00\x00\x00\x00'])
91-
# at this point we only have NaNs from NUL characters, not -999 etc.
92-
# these bad rows need to be removed so that dtypes can be set.
93-
# NaNs require float dtype so we run into errors if we don't do this.
94-
data = data.dropna(axis=0)
95-
# loop here because dtype kwarg not supported in read_fwf until 0.20
96-
for (col, _dtype) in zip(data.columns, DTYPES):
97-
data[col] = data[col].astype(_dtype)
109+
# read in data
110+
data = pd.read_fwf(filename, header=None, names=HEADERS, widths=WIDTHS,
111+
na_values=NAN_DICT)
112+
# Remove rows with all nans
113+
data = data.dropna(axis=0, how='all')
114+
# set dtypes here because dtype kwarg not supported in read_fwf until 0.20
115+
data = data.astype(dict(zip(HEADERS, DTYPES)))
98116

99117
# set index
100118
# UTC_TIME does not have leading 0s, so must zfill(4) to comply
@@ -103,19 +121,8 @@ def read_crn(filename):
103121
dtindex = pd.to_datetime(dts['UTC_DATE'] + dts['UTC_TIME'].str.zfill(4),
104122
format='%Y%m%d%H%M', utc=True)
105123
data = data.set_index(dtindex)
106-
try:
107-
# to_datetime(utc=True) does not work in older versions of pandas
108-
data = data.tz_localize('UTC')
109-
except TypeError:
110-
pass
111-
112-
# Now we can set nans. This could be done a per column basis to be
113-
# safer, since in principle a real -99 value could occur in a -9999
114-
# column. Very unlikely to see that in the real world.
115-
for val in [-99, -999, -9999]:
116-
# consider replacing with .replace([-99, -999, -9999])
117-
data = data.where(data != val, np.nan)
118-
119-
data = data.rename(columns=VARIABLE_MAP)
124+
125+
if map_variables:
126+
data = data.rename(columns=VARIABLE_MAP)
120127

121128
return data

pvlib/tests/iotools/test_crn.py

Lines changed: 24 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,11 @@
33
from numpy import dtype, nan
44
import pytest
55
from pvlib.iotools import crn
6-
from ..conftest import DATA_DIR, assert_frame_equal
6+
from ..conftest import DATA_DIR, assert_frame_equal, assert_index_equal
77

88

99
@pytest.fixture
10-
def columns():
10+
def columns_mapped():
1111
return [
1212
'WBANNO', 'UTC_DATE', 'UTC_TIME', 'LST_DATE', 'LST_TIME', 'CRX_VN',
1313
'longitude', 'latitude', 'temp_air', 'PRECIPITATION', 'ghi',
@@ -17,6 +17,16 @@ def columns():
1717
'WETNESS', 'WET_FLAG', 'wind_speed', 'wind_speed_flag']
1818

1919

20+
@pytest.fixture
21+
def columns_unmapped():
22+
return [
23+
'WBANNO', 'UTC_DATE', 'UTC_TIME', 'LST_DATE', 'LST_TIME', 'CRX_VN',
24+
'LONGITUDE', 'LATITUDE', 'AIR_TEMPERATURE', 'PRECIPITATION',
25+
'SOLAR_RADIATION', 'SR_FLAG', 'SURFACE_TEMPERATURE', 'ST_TYPE',
26+
'ST_FLAG', 'RELATIVE_HUMIDITY', 'RH_FLAG', 'SOIL_MOISTURE_5',
27+
'SOIL_TEMPERATURE_5', 'WETNESS', 'WET_FLAG', 'WIND_1_5', 'WIND_FLAG']
28+
29+
2030
@pytest.fixture
2131
def dtypes():
2232
return [
@@ -39,7 +49,7 @@ def testfile_problems():
3949
return DATA_DIR / 'CRN_with_problems.txt'
4050

4151

42-
def test_read_crn(testfile, columns, dtypes):
52+
def test_read_crn(testfile, columns_mapped, dtypes):
4353
index = pd.DatetimeIndex(['2019-01-01 16:10:00',
4454
'2019-01-01 16:15:00',
4555
'2019-01-01 16:20:00',
@@ -54,25 +64,31 @@ def test_read_crn(testfile, columns, dtypes):
5464
0.0, 340.0, 0, 4.3, 'C', 0, 83.0, 0, nan, nan, 1183, 0, 0.53, 0],
5565
[53131, 20190101, 1625, 20190101, 925, 3, -111.17, 32.24, 4.0,
5666
0.0, 393.0, 0, 4.8, 'C', 0, 81.0, 0, nan, nan, 1223, 0, 0.64, 0]])
57-
expected = pd.DataFrame(values, columns=columns, index=index)
67+
expected = pd.DataFrame(values, columns=columns_mapped, index=index)
5868
for (col, _dtype) in zip(expected.columns, dtypes):
5969
expected[col] = expected[col].astype(_dtype)
6070
out = crn.read_crn(testfile)
6171
assert_frame_equal(out, expected)
6272

6373

64-
def test_read_crn_problems(testfile_problems, columns, dtypes):
74+
# Test map_variables=False returns correct column names
75+
def test_read_crn_map_variables(testfile, columns_unmapped, dtypes):
76+
out = crn.read_crn(testfile, map_variables=False)
77+
assert_index_equal(out.columns, pd.Index(columns_unmapped))
78+
79+
80+
def test_read_crn_problems(testfile_problems, columns_mapped, dtypes):
6581
# GH1025
6682
index = pd.DatetimeIndex(['2020-07-06 12:00:00',
6783
'2020-07-06 13:10:00'],
6884
freq=None).tz_localize('UTC')
6985
values = np.array([
70-
[92821, 20200706, 1200, 20200706, 700, '3', -80.69, 28.62, 24.9,
71-
0.0, 190.0, 0, 25.5, 'C', 0, 93.0, 0, nan, nan, 990, 0, 1.57, 0],
86+
[92821, 20200706, 1200, 20200706, 700, '3.0', -80.69, 28.62, 24.9,
87+
0.0, np.nan, 0, 25.5, 'C', 0, 93.0, 0, nan, nan, 990, 0, 1.57, 0],
7288
[92821, 20200706, 1310, 20200706, 810, '2.623', -80.69, 28.62,
7389
26.9, 0.0, 430.0, 0, 30.2, 'C', 0, 87.0, 0, nan, nan, 989, 0,
7490
1.64, 0]])
75-
expected = pd.DataFrame(values, columns=columns, index=index)
91+
expected = pd.DataFrame(values, columns=columns_mapped, index=index)
7692
for (col, _dtype) in zip(expected.columns, dtypes):
7793
expected[col] = expected[col].astype(_dtype)
7894
out = crn.read_crn(testfile_problems)

0 commit comments

Comments
 (0)