Add map_variables and -99999 nan value to read_crn (#1368)

AdamRJensen · kandersolar · web-flow · commit c0c46b4dcb5b · 2022-02-17T06:47:40.000-07:00
* Add map_variables argument to read_crn

* Add test coverage for map_variables

* Update whatsnew

* Remove unnecessary tz_localize

* Replace nans with .replace instead of .where

* Extend documentation

* Simplify test coverage of map_variables

* Use assert_index_equal instead of assert

Co-authored-by: Kevin Anderson <57452607+kanderso-nrel@users.noreply.github.com>

* Add import of assert_index_equal

* Add -99999 and -999999 to nan values

* Add -99999 nan bug to whatsnew

* Add -99999 to test file

* Update doc and whatsnew

* Remove -999999 from list of nans

* Minor doc update

* Add dictionary of nan values

* Change CRN_VARIABLE_MAP back to VARIABLE_MAP

* Remove numpy import

* Reformat setting of dtypes

Co-authored-by: Kevin Anderson <57452607+kanderso-nrel@users.noreply.github.com>
diff --git a/docs/sphinx/source/whatsnew/v0.9.1.rst b/docs/sphinx/source/whatsnew/v0.9.1.rst
@@ -16,6 +16,7 @@ Deprecations
 
 Enhancements
 ~~~~~~~~~~~~
+* Added ``map_variables`` option to :func:`~pvlib.iotools.read_crn` (:pull:`1368`)
 * Added :py:func:`pvlib.temperature.prilliman` for modeling cell temperature
   at short time steps (:issue:`1081`, :pull:`1391`)
 
@@ -29,6 +30,8 @@ Bug fixes
   argument was not being passed to the ``optimalinclination`` request parameter (:pull:`1356`)
 * Fixed bug in :py:func:`pvlib.bifacial.pvfactors_timeseries` where scalar ``surface_tilt``
   and ``surface_azimuth`` inputs caused an error (:issue:`1127`, :issue:`1332`, :pull:`1361`) 
+* Added -99999 to list of values to map to nan in :func:`~pvlib.iotools.read_crn`
+  (:issue:`1372`, :pull:`1368`)
 * Changed the metadata entry for the wind speed unit to "Wind Speed Units" in
   the PSM3 iotools function (:pull:`1375`)
 * Improved convergence when determining the maximum power point using
diff --git a/pvlib/data/CRN_with_problems.txt b/pvlib/data/CRN_with_problems.txt
diff --git a/pvlib/iotools/crn.py b/pvlib/iotools/crn.py
@@ -2,15 +2,14 @@
 """
 
 import pandas as pd
-import numpy as np
 
 
-HEADERS = (
-    'WBANNO UTC_DATE UTC_TIME LST_DATE LST_TIME CRX_VN LONGITUDE LATITUDE '
-    'AIR_TEMPERATURE PRECIPITATION SOLAR_RADIATION SR_FLAG '
-    'SURFACE_TEMPERATURE ST_TYPE ST_FLAG RELATIVE_HUMIDITY RH_FLAG '
-    'SOIL_MOISTURE_5 SOIL_TEMPERATURE_5 WETNESS WET_FLAG WIND_1_5 WIND_FLAG'
-)
+HEADERS = [
+    'WBANNO', 'UTC_DATE', 'UTC_TIME', 'LST_DATE', 'LST_TIME', 'CRX_VN',
+    'LONGITUDE', 'LATITUDE', 'AIR_TEMPERATURE', 'PRECIPITATION',
+    'SOLAR_RADIATION', 'SR_FLAG', 'SURFACE_TEMPERATURE', 'ST_TYPE', 'ST_FLAG',
+    'RELATIVE_HUMIDITY', 'RH_FLAG', 'SOIL_MOISTURE_5', 'SOIL_TEMPERATURE_5',
+    'WETNESS', 'WET_FLAG', 'WIND_1_5', 'WIND_FLAG']
 
 VARIABLE_MAP = {
     'LONGITUDE': 'longitude',
@@ -24,6 +23,21 @@
     'WIND_FLAG': 'wind_speed_flag'
 }
 
+NAN_DICT = {
+    'CRX_VN': -99999,
+    'AIR_TEMPERATURE': -9999,
+    'PRECIPITATION': -9999,
+    'SOLAR_RADIATION': -99999,
+    'SURFACE_TEMPERATURE': -9999,
+    'RELATIVE_HUMIDITY': -9999,
+    'SOIL_MOISTURE_5': -99,
+    'SOIL_TEMPERATURE_5': -9999,
+    'WETNESS': -9999,
+    'WIND_1_5': -99}
+
+# Add NUL characters to possible NaN values for all columns
+NAN_DICT = {k: [v, '\x00\x00\x00\x00\x00\x00'] for k, v in NAN_DICT.items()}
+
 # as specified in CRN README.txt file. excludes 1 space between columns
 WIDTHS = [5, 8, 4, 8, 4, 6, 7, 7, 7, 7, 6, 1, 7, 1, 1, 5, 1, 7, 7, 5, 1, 6, 1]
 # add 1 to make fields contiguous (required by pandas.read_fwf)
@@ -40,15 +54,22 @@
 ]
 
 
-def read_crn(filename):
-    """
-    Read a NOAA USCRN fixed-width file into pandas dataframe.  The CRN is
-    described in [1]_ and [2]_.
+def read_crn(filename, map_variables=True):
+    """Read a NOAA USCRN fixed-width file into a pandas dataframe.
+
+    The CRN network consists of over 100 meteorological stations covering the
+    U.S. and is described in [1]_ and [2]_. The primary goal of CRN is to
+    provide long-term measurements of temperature, precipitation, and soil
+    moisture and temperature. Additionally, global horizontal irradiance (GHI)
+    is measured at each site using a photodiode pyranometer.
 
     Parameters
     ----------
     filename: str, path object, or file-like
         filepath or url to read for the fixed-width file.
+    map_variables: boolean, default: True
+        When true, renames columns of the DataFrame to pvlib variable names
+        where applicable. See variable :const:`VARIABLE_MAP`.
 
     Returns
     -------
@@ -60,12 +81,12 @@ def read_crn(filename):
     -----
     CRN files contain 5 minute averages labeled by the interval ending
     time. Here, missing data is flagged as NaN, rather than the lowest
-    possible integer for a field (e.g. -999 or -99). Air temperature in
-    deg C. Wind speed in m/s at a height of 1.5 m above ground level.
+    possible integer for a field (e.g. -999 or -99). Air temperature is in
+    deg C and wind speed is in m/s at a height of 1.5 m above ground level.
 
-    Variables corresponding to standard pvlib variables are renamed,
+    Variables corresponding to standard pvlib variables are by default renamed,
     e.g. `SOLAR_RADIATION` becomes `ghi`. See the
-    `pvlib.iotools.crn.VARIABLE_MAP` dict for the complete mapping.
+    :const:`pvlib.iotools.crn.VARIABLE_MAP` dict for the complete mapping.
 
     CRN files occasionally have a set of null characters on a line
     instead of valid data. This function drops those lines. Sometimes
@@ -85,16 +106,13 @@ def read_crn(filename):
        Amer. Meteor. Soc., 94, 489-498. :doi:`10.1175/BAMS-D-12-00170.1`
     """
 
-    # read in data. set fields with NUL characters to NaN
-    data = pd.read_fwf(filename, header=None, names=HEADERS.split(' '),
-                       widths=WIDTHS, na_values=['\x00\x00\x00\x00\x00\x00'])
-    # at this point we only have NaNs from NUL characters, not -999 etc.
-    # these bad rows need to be removed so that dtypes can be set.
-    # NaNs require float dtype so we run into errors if we don't do this.
-    data = data.dropna(axis=0)
-    # loop here because dtype kwarg not supported in read_fwf until 0.20
-    for (col, _dtype) in zip(data.columns, DTYPES):
-        data[col] = data[col].astype(_dtype)
+    # read in data
+    data = pd.read_fwf(filename, header=None, names=HEADERS, widths=WIDTHS,
+                       na_values=NAN_DICT)
+    # Remove rows with all nans
+    data = data.dropna(axis=0, how='all')
+    # set dtypes here because dtype kwarg not supported in read_fwf until 0.20
+    data = data.astype(dict(zip(HEADERS, DTYPES)))
 
     # set index
     # UTC_TIME does not have leading 0s, so must zfill(4) to comply
@@ -103,19 +121,8 @@ def read_crn(filename):
     dtindex = pd.to_datetime(dts['UTC_DATE'] + dts['UTC_TIME'].str.zfill(4),
                              format='%Y%m%d%H%M', utc=True)
     data = data.set_index(dtindex)
-    try:
-        # to_datetime(utc=True) does not work in older versions of pandas
-        data = data.tz_localize('UTC')
-    except TypeError:
-        pass
-
-    # Now we can set nans. This could be done a per column basis to be
-    # safer, since in principle a real -99 value could occur in a -9999
-    # column. Very unlikely to see that in the real world.
-    for val in [-99, -999, -9999]:
-        # consider replacing with .replace([-99, -999, -9999])
-        data = data.where(data != val, np.nan)
-
-    data = data.rename(columns=VARIABLE_MAP)
+
+    if map_variables:
+        data = data.rename(columns=VARIABLE_MAP)
 
     return data
diff --git a/pvlib/tests/iotools/test_crn.py b/pvlib/tests/iotools/test_crn.py
@@ -3,11 +3,11 @@
 from numpy import dtype, nan
 import pytest
 from pvlib.iotools import crn
-from ..conftest import DATA_DIR, assert_frame_equal
+from ..conftest import DATA_DIR, assert_frame_equal, assert_index_equal
 
 
 @pytest.fixture
-def columns():
+def columns_mapped():
     return [
         'WBANNO', 'UTC_DATE', 'UTC_TIME', 'LST_DATE', 'LST_TIME', 'CRX_VN',
         'longitude', 'latitude', 'temp_air', 'PRECIPITATION', 'ghi',
@@ -17,6 +17,16 @@ def columns():
         'WETNESS', 'WET_FLAG', 'wind_speed', 'wind_speed_flag']
 
 
+@pytest.fixture
+def columns_unmapped():
+    return [
+        'WBANNO', 'UTC_DATE', 'UTC_TIME', 'LST_DATE', 'LST_TIME', 'CRX_VN',
+        'LONGITUDE', 'LATITUDE', 'AIR_TEMPERATURE', 'PRECIPITATION',
+        'SOLAR_RADIATION', 'SR_FLAG', 'SURFACE_TEMPERATURE', 'ST_TYPE',
+        'ST_FLAG', 'RELATIVE_HUMIDITY', 'RH_FLAG', 'SOIL_MOISTURE_5',
+        'SOIL_TEMPERATURE_5', 'WETNESS', 'WET_FLAG', 'WIND_1_5', 'WIND_FLAG']
+
+
 @pytest.fixture
 def dtypes():
     return [
@@ -39,7 +49,7 @@ def testfile_problems():
     return DATA_DIR / 'CRN_with_problems.txt'
 
 
-def test_read_crn(testfile, columns, dtypes):
+def test_read_crn(testfile, columns_mapped, dtypes):
     index = pd.DatetimeIndex(['2019-01-01 16:10:00',
                               '2019-01-01 16:15:00',
                               '2019-01-01 16:20:00',
@@ -54,25 +64,31 @@ def test_read_crn(testfile, columns, dtypes):
          0.0, 340.0, 0, 4.3, 'C', 0, 83.0, 0, nan, nan, 1183, 0, 0.53, 0],
         [53131, 20190101, 1625, 20190101, 925, 3, -111.17, 32.24, 4.0,
          0.0, 393.0, 0, 4.8, 'C', 0, 81.0, 0, nan, nan, 1223, 0, 0.64, 0]])
-    expected = pd.DataFrame(values, columns=columns, index=index)
+    expected = pd.DataFrame(values, columns=columns_mapped, index=index)
     for (col, _dtype) in zip(expected.columns, dtypes):
         expected[col] = expected[col].astype(_dtype)
     out = crn.read_crn(testfile)
     assert_frame_equal(out, expected)
 
 
-def test_read_crn_problems(testfile_problems, columns, dtypes):
+# Test map_variables=False returns correct column names
+def test_read_crn_map_variables(testfile, columns_unmapped, dtypes):
+    out = crn.read_crn(testfile, map_variables=False)
+    assert_index_equal(out.columns, pd.Index(columns_unmapped))
+
+
+def test_read_crn_problems(testfile_problems, columns_mapped, dtypes):
     # GH1025
     index = pd.DatetimeIndex(['2020-07-06 12:00:00',
                               '2020-07-06 13:10:00'],
                              freq=None).tz_localize('UTC')
     values = np.array([
-        [92821, 20200706, 1200, 20200706, 700, '3', -80.69, 28.62, 24.9,
-         0.0, 190.0, 0, 25.5, 'C', 0, 93.0, 0, nan, nan, 990, 0, 1.57, 0],
+        [92821, 20200706, 1200, 20200706, 700, '3.0', -80.69, 28.62, 24.9,
+         0.0, np.nan, 0, 25.5, 'C', 0, 93.0, 0, nan, nan, 990, 0, 1.57, 0],
         [92821, 20200706, 1310, 20200706, 810, '2.623', -80.69, 28.62,
          26.9, 0.0, 430.0, 0, 30.2, 'C', 0, 87.0, 0, nan, nan, 989, 0,
          1.64, 0]])
-    expected = pd.DataFrame(values, columns=columns, index=index)
+    expected = pd.DataFrame(values, columns=columns_mapped, index=index)
     for (col, _dtype) in zip(expected.columns, dtypes):
         expected[col] = expected[col].astype(_dtype)
     out = crn.read_crn(testfile_problems)