diff --git a/.bumpversion.cfg b/.bumpversion.cfg index fe7064afc..de364861c 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.3.54 +current_version = 0.3.55 commit = True message = chore: bump covidcast-indicators to {new_version} tag = False diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs new file mode 100644 index 000000000..9b48a931a --- /dev/null +++ b/.git-blame-ignore-revs @@ -0,0 +1,6 @@ +# Format geomap.py +d4b056e7a4c11982324e9224c9f9f6fd5d5ec65c +# Format test_geomap.py +79072dcdec3faca9aaeeea65de83f7fa5c00d53f +# Sort setup.py dependencies +6912077acba97e835aff7d0cd3d64309a1a9241d \ No newline at end of file diff --git a/.github/workflows/backfill-corr-ci.yml b/.github/workflows/backfill-corr-ci.yml index 3143050eb..23eb8c0d1 100644 --- a/.github/workflows/backfill-corr-ci.yml +++ b/.github/workflows/backfill-corr-ci.yml @@ -10,49 +10,28 @@ name: R backfill corrections on: push: - branches: [ main, prod ] + branches: [main, prod] pull_request: - types: [ opened, synchronize, reopened, ready_for_review ] - branches: [ main, prod ] + types: [opened, synchronize, reopened, ready_for_review] + branches: [main, prod] jobs: build: - runs-on: ubuntu-20.04 + runs-on: ubuntu-latest if: github.event.pull_request.draft == false - strategy: - matrix: - r-version: [4.2.1] defaults: run: working-directory: backfill_corrections/delphiBackfillCorrection steps: - - uses: actions/checkout@v2 - - name: Set up R ${{ matrix.r-version }} + - uses: actions/checkout@v4 + + - name: Set up R 4.2 uses: r-lib/actions/setup-r@v2 with: - r-version: ${{ matrix.r-version }} use-public-rspm: true - - name: Install linux dependencies - run: | - sudo apt-get install \ - libcurl4-openssl-dev \ - libgdal-dev \ - libudunits2-dev \ - libglpk-dev \ - libharfbuzz-dev \ - libfribidi-dev - - name: Get date - id: get-date - run: | - echo "::set-output name=date::$(/bin/date -u "+%Y%m%d")" - - name: Cache R packages - uses: actions/cache@v2 - with: - path: ${{ env.R_LIBS_USER }} - key: ${{ runner.os }}-r-backfillcorr-${{ steps.get-date.outputs.date }} - restore-keys: | - ${{ runner.os }}-r-backfillcorr- + r-version: 4.2 + - name: Install and cache dependencies env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} @@ -60,7 +39,8 @@ jobs: with: extra-packages: any::rcmdcheck working-directory: backfill_corrections/delphiBackfillCorrection - upgrade: 'TRUE' + upgrade: "TRUE" + - name: Check package uses: r-lib/actions/check-r-package@v2 with: diff --git a/.github/workflows/build-container-images.yml b/.github/workflows/build-container-images.yml index 87d9b5446..18eaeab35 100644 --- a/.github/workflows/build-container-images.yml +++ b/.github/workflows/build-container-images.yml @@ -2,14 +2,15 @@ name: Build indicator container images and upload to registry on: push: - branches: [ main, prod ] + branches: [main, prod] + workflow_dispatch: jobs: build: runs-on: ubuntu-latest strategy: matrix: - packages: [ backfill_corrections ] + packages: [backfill_corrections] steps: - name: Checkout code uses: actions/checkout@v2 diff --git a/.github/workflows/python-ci.yml b/.github/workflows/python-ci.yml index 0534cbce2..3e1ee9689 100644 --- a/.github/workflows/python-ci.yml +++ b/.github/workflows/python-ci.yml @@ -16,28 +16,42 @@ jobs: if: github.event.pull_request.draft == false strategy: matrix: - packages: - [ - _delphi_utils_python, - changehc, - claims_hosp, - doctor_visits, - google_symptoms, - hhs_hosp, - nchs_mortality, - nwss_wastewater, - quidel_covidtest, - 
sir_complainsalot, - ] + include: + - package: "_delphi_utils_python" + dir: "delphi_utils" + - package: "changehc" + dir: "delphi_changehc" + - package: "claims_hosp" + dir: "delphi_claims_hosp" + - package: "doctor_visits" + dir: "delphi_doctor_visits" + - package: "google_symptoms" + dir: "delphi_google_symptoms" + - package: "hhs_hosp" + dir: "delphi_hhs" + - package: "nchs_mortality" + dir: "delphi_nchs_mortality" + - package: "nssp" + dir: "delphi_nssp" + - package: "nwss_wastewater" + dir: "delphi_nwss" + - package: "quidel_covidtest" + dir: "delphi_quidel_covidtest" + - package: "sir_complainsalot" + dir: "delphi_sir_complainsalot" defaults: run: - working-directory: ${{ matrix.packages }} + working-directory: ${{ matrix.package }} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 + with: + fetch-depth: 0 - name: Set up Python 3.8 - uses: actions/setup-python@v2 + uses: actions/setup-python@v5 with: python-version: 3.8 + cache: "pip" + cache-dependency-path: "setup.py" - name: Install testing dependencies run: | python -m pip install --upgrade pip @@ -51,3 +65,8 @@ jobs: - name: Test run: | make test + - uses: akaihola/darker@v2.1.1 + with: + options: "--check --diff --isort --color" + src: "${{ matrix.package }}/${{ matrix.dir }}" + version: "~=2.1.1" diff --git a/Jenkinsfile b/Jenkinsfile index 0052fd215..3011ebde7 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -10,7 +10,7 @@ - TODO: #527 Get this list automatically from python-ci.yml at runtime. */ -def indicator_list = ["backfill_corrections", "changehc", "claims_hosp", "google_symptoms", "hhs_hosp", "nchs_mortality", "quidel_covidtest", "sir_complainsalot", "doctor_visits", "nwss_wastewater"] +def indicator_list = ["backfill_corrections", "changehc", "claims_hosp", "google_symptoms", "hhs_hosp", "nchs_mortality", "quidel_covidtest", "sir_complainsalot", "doctor_visits", "nwss_wastewater", "nssp"] def build_package_main = [:] def build_package_prod = [:] def deploy_staging = [:] diff --git a/README.md b/README.md index 049b3ad49..3d4f8d161 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ In early April 2020, Delphi developed a uniform data schema for [a new Epidata endpoint focused on COVID-19](https://cmu-delphi.github.io/delphi-epidata/api/covidcast.html). Our intent was to provide signals that would track in real-time and in fine geographic granularity all facets of the COVID-19 pandemic, aiding both nowcasting and forecasting. Delphi's long history in tracking and forecasting influenza made us uniquely situated to provide access to data streams not available anywhere else, including medical claims data, electronic medical records, lab test records, massive public surveys, and internet search trends. We also process commonly-used publicly-available data sources, both for user convenience and to provide data versioning for sources that do not track revisions themselves. -Each data stream arrives in a different format using a different delivery technique, be it sftp, an access-controlled API, or an email attachment. The purpose of each pipeline in this repository is to fetch the raw source data, extract informative aggregate signals, and output those signals---which we call **COVID-19 indicators**---in a common format for upload to the [COVIDcast API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast.html). +Each data stream arrives in a different format using a different delivery technique, be it sftp, an access-controlled API, or an email attachment. 
The purpose of each pipeline in this repository is to fetch the raw source data, extract informative aggregate signals, and output those signals---which we call **COVID-19 indicators**---in a common format for upload to the [COVIDcast API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast.html). For client access to the API, along with a variety of other utilities, see our [R](https://cmu-delphi.github.io/covidcast/covidcastR/) and [Python](https://cmu-delphi.github.io/covidcast/covidcast-py/html/) packages. @@ -13,18 +13,19 @@ For interactive visualizations (of a subset of the available indicators), see ou ## Organization Utilities: -* `_delphi_utils_python` - common behaviors -* `_template_python` & `_template_r` - starting points for new data sources -* `ansible` & `jenkins` - automated testing and deployment -* `sir_complainsalot` - a Slack bot to check for missing data + +- `_delphi_utils_python` - common behaviors +- `_template_python` & `_template_r` - starting points for new data sources +- `ansible` & `jenkins` - automated testing and deployment +- `sir_complainsalot` - a Slack bot to check for missing data Indicator pipelines: all remaining directories. -Each indicator pipeline includes its own documentation. +Each indicator pipeline includes its own documentation. -* Consult README.md for directions to install, lint, test, and run the pipeline for that indicator. -* Consult REVIEW.md for the checklist to use for code reviews. -* Consult DETAILS.md (if present) for implementation details, including handling of corner cases. +- Consult README.md for directions to install, lint, test, and run the pipeline for that indicator. +- Consult REVIEW.md for the checklist to use for code reviews. +- Consult DETAILS.md (if present) for implementation details, including handling of corner cases. ## Development @@ -35,6 +36,28 @@ Each indicator pipeline includes its own documentation. 3. Add new commits to your branch in response to feedback. 4. When approved, tag an admin to merge the PR. Let them know if this change should be released immediately, at a set future date, or if it can just go along for the ride whenever the next release happens. +### Linting and Formatting + +Each indicator has a `make lint` command to check for linting errors and a `make +format` command to incrementally format your code (using +[darker](https://github.com/akaihola/darker)). These are both automated with a +[Github Action](.github/workflows/python-ci.yml). + +If you get the error `ERROR:darker.git:fatal: Not a valid commit name `, +then it's likely because your local main branch is not up to date; either you +need to rebase or merge. Note that `darker` reads from `pyproject.toml` for +default settings. + +If the lines you change are in a file that uses 2 space indentation, `darker` +will indent the lines around your changes and not the rest, which will likely +break the code; in that case, you should probably just pass the whole file +through black. 
You can do that with the following command (using the same +virtual environment as above): + +```sh +env/bin/black +``` + ## Release Process The release process consists of multiple steps which can all be done via the GitHub website: diff --git a/_delphi_utils_python/.bumpversion.cfg b/_delphi_utils_python/.bumpversion.cfg index 3d4bc08a0..722a91e30 100644 --- a/_delphi_utils_python/.bumpversion.cfg +++ b/_delphi_utils_python/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.3.23 +current_version = 0.3.24 commit = True message = chore: bump delphi_utils to {new_version} tag = False diff --git a/_delphi_utils_python/.pylintrc b/_delphi_utils_python/.pylintrc deleted file mode 100644 index ad0180ed7..000000000 --- a/_delphi_utils_python/.pylintrc +++ /dev/null @@ -1,22 +0,0 @@ - -[MESSAGES CONTROL] - -disable=logging-format-interpolation, - too-many-locals, - too-many-arguments, - # Allow pytest functions to be part of a class. - no-self-use, - # Allow pytest classes to have one test. - too-few-public-methods - -[BASIC] - -# Allow arbitrarily short-named variables. -variable-rgx=([a-z_][a-z0-9_]*|[a-zA-Z]) -argument-rgx=([a-z_][a-z0-9_]*|[a-zA-Z]) -attr-rgx=([a-z_][a-z0-9_]*|[a-zA-Z]) - -[DESIGN] - -# Don't complain about pytest "unused" arguments. -ignored-argument-names=(_.*|run_as_module) \ No newline at end of file diff --git a/_delphi_utils_python/Makefile b/_delphi_utils_python/Makefile index dd9c5f37f..79d7f7943 100644 --- a/_delphi_utils_python/Makefile +++ b/_delphi_utils_python/Makefile @@ -14,9 +14,12 @@ install-ci: venv pip install . lint: - . env/bin/activate; pylint delphi_utils + . env/bin/activate; pylint delphi_utils --rcfile=../pyproject.toml . env/bin/activate; pydocstyle delphi_utils +format: + . env/bin/activate; darker delphi_utils + test: . env/bin/activate ;\ (cd tests && ../env/bin/pytest --cov=delphi_utils --cov-report=term-missing) diff --git a/_delphi_utils_python/data_proc/geomap/README.md b/_delphi_utils_python/data_proc/geomap/README.md index 08075fff9..38297b691 100644 --- a/_delphi_utils_python/data_proc/geomap/README.md +++ b/_delphi_utils_python/data_proc/geomap/README.md @@ -1,4 +1,4 @@ -# Geocoding data processing pipeline +# Geocoding Data Processing Authors: Jingjing Tang, James Sharpnack, Dmitry Shemetov @@ -7,42 +7,37 @@ Authors: Jingjing Tang, James Sharpnack, Dmitry Shemetov Requires the following source files below. Run the following to build the crosswalk tables in `covidcast-indicators/_delph_utils_python/delph_utils/data` -``` + +```sh $ python geo_data_proc.py ``` -You can see consistency checks and diffs with old sources in ./consistency_checks.ipynb +Find data consistency checks in `./source-file-sanity-check.ipynb`. ## Geo Codes We support the following geocodes. -- The ZIP code and the FIPS code are the most granular geocodes we support. - - The [ZIP code](https://en.wikipedia.org/wiki/ZIP_Code) is a US postal code used by the USPS and the [FIPS code](https://en.wikipedia.org/wiki/FIPS_county_code) is an identifier for US counties and other associated territories. The ZIP code is five digit code (with leading zeros). - - The FIPS code is a five digit code (with leading zeros), where the first two digits are a two-digit state code and the last three are a three-digit county code (see this [US Census Bureau page](https://www.census.gov/library/reference/code-lists/ansi.html) for detailed information). 
-- The Metropolitan Statistical Area (MSA) code refers to regions around cities (these are sometimes referred to as CBSA codes). More information on these can be found at the [US Census Bureau](https://www.census.gov/programs-surveys/metro-micro/about.html).
- - We are reserving 10001-10099 for states codes of the form 100XX where XX is the FIPS code for the state (the current smallest CBSA is 10100). In the case that the CBSA codes change then it should be verified that these are not used.
+- The [ZIP code](https://en.wikipedia.org/wiki/ZIP_Code) is a US postal code used by the USPS and the [FIPS code](https://en.wikipedia.org/wiki/FIPS_county_code) is an identifier for US counties and other associated territories. The ZIP code is a five-digit code (with leading zeros).
+- The FIPS code is a five-digit code (with leading zeros), where the first two digits are a two-digit state code and the last three are a three-digit county code (see this [US Census Bureau page](https://www.census.gov/library/reference/code-lists/ansi.html) for detailed information).
+- The Metropolitan Statistical Area (MSA) code refers to regions around cities (these are sometimes referred to as CBSA codes). More information on these can be found at the [US Census Bureau](https://www.census.gov/programs-surveys/metro-micro/about.html). We reserve 10001-10099 for state codes of the form 100XX where XX is the FIPS code for the state (the current smallest CBSA is 10100). In the case that the CBSA codes change, it should be verified that these are not used.
- State codes are a series of equivalent identifiers for US states. They include the state name, the state number (state_code), and the state two-letter abbreviation (state_id). The state number is the state FIPS code. See [here](https://en.wikipedia.org/wiki/List_of_U.S._state_and_territory_abbreviations) for more.
- The Hospital Referral Region (HRR) and the Hospital Service Area (HSA). More information [here](https://www.dartmouthatlas.org/covid-19/hrr-mapping/).
-FIPS codes depart in some special cases, so we produce manual changes listed below.
-## Source files
+## Source Files
The source files are requested from a government URL when `geo_data_proc.py` is run (see the top of said script for the URLs). Below we describe the locations to find updated versions of the source files, if they are ever needed.
- ZIP -> FIPS (county) population tables available from [US Census](https://www.census.gov/geographies/reference-files/time-series/geo/relationship-files.html#par_textimage_674173622). This file contains the population of the intersections between ZIP and FIPS regions, allowing the creation of a population-weighted transform between the two. As of 4 February 2022, this source did not include population information for 24 ZIPs that appear in our indicators. We have added those values manually using information available from the [zipdatamaps website](www.zipdatamaps.com).
- ZIP -> HRR -> HSA crosswalk file comes from the 2018 version at the [Dartmouth Atlas Project](https://atlasdata.dartmouth.edu/static/supp_research_data).
- FIPS -> MSA crosswalk file comes from the September 2018 version of the delineation files at the [US Census Bureau](https://www.census.gov/geographies/reference-files/time-series/demo/metro-micro/delineation-files.html).
-- State Code -> State ID -> State Name comes from the ANSI standard at the [US Census](https://www.census.gov/library/reference/code-lists/ansi.html#par_textimage_3).
The first two digits of a FIPS codes should match the state code here. +- State Code -> State ID -> State Name comes from the ANSI standard at the [US Census](https://www.census.gov/library/reference/code-lists/ansi.html#par_textimage_3). - -## Derived files +## Derived Files The rest of the crosswalk tables are derived from the mappings above. We provide crosswalk functions from granular to coarser codes, but not the other way around. This is because there is no information gained when crosswalking from coarse to granular. - - -## Deprecated source files +## Deprecated Source Files - ZIP to FIPS to HRR to states: `02_20_uszips.csv` comes from a version of the table [here](https://simplemaps.com/data/us-zips) modified by Jingjing to include population weights. - The `02_20_uszips.csv` file is based on the newest consensus data including 5-digit zipcode, fips code, county name, state, population, HRR, HSA (I downloaded the original file from [here](https://simplemaps.com/data/us-zips). This file matches best to the most recent (2020) situation in terms of the population. But there still exist some matching problems. I manually checked and corrected those lines (~20) with [zip-codes](https://www.zip-codes.com/zip-code/58439/zip-code-58439.asp). The mapping from 5-digit zipcode to HRR is based on the file in 2017 version downloaded from [here](https://atlasdata.dartmouth.edu/static/supp_research_data). @@ -51,7 +46,3 @@ The rest of the crosswalk tables are derived from the mappings above. We provide - CBSA -> FIPS crosswalk from [here](https://data.nber.org/data/cbsa-fips-county-crosswalk.html) (the file is `cbsatocountycrosswalk.csv`). - MSA tables from March 2020 [here](https://www.census.gov/geographies/reference-files/time-series/demo/metro-micro/delineation-files.html). This file seems to differ in a few fips codes from the source for the 02_20_uszip file which Jingjing constructed. There are at least 10 additional fips in 03_20_msa that are not in the uszip file, and one of the msa codes seems to be incorrect: 49020 (a google search confirms that it is incorrect in uszip and correct in the census data). - MSA tables from 2019 [here](https://apps.bea.gov/regional/docs/msalist.cfm) - -## Notes - -- The NAs in the coding currently zero-fills. 
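As a complement to the consistency notebook mentioned above, a generated population-weighted crosswalk can be sanity-checked directly with pandas by verifying that the weights for each source geo sum to 1 (the same normalization condition described in the GeoMapper docstring and checked in the package tests). The sketch below is illustrative only: it assumes `geo_data_proc.py` has already written `zip_fips_table.csv` (columns `zip`, `fips`, `weight`) to the 2020 output directory, and the path and tolerance shown are assumptions rather than part of the pipeline.

```python
# Illustrative sanity check for a generated crosswalk table (not part of the
# pipeline). Assumes geo_data_proc.py has written zip_fips_table.csv to the
# 2020 output directory; adjust the path to wherever the table was built.
import pandas as pd

crosswalk = pd.read_csv(
    "../../delphi_utils/data/2020/zip_fips_table.csv",  # assumed output location
    dtype={"zip": str, "fips": str, "weight": float},
)

# For a population-weighted mapping, the weights of all (zip -> fips) rows
# belonging to one source zip should sum to 1 (up to rounding).
weight_sums = crosswalk.groupby("zip")["weight"].sum()
not_normalized = weight_sums[(weight_sums - 1.0).abs() > 1e-5]

print(f"{len(not_normalized)} ZIPs whose weights do not sum to 1")
print(not_normalized.head())
```

The unit tests in `_delphi_utils_python/tests/test_geomap.py` run an equivalent normalization check against the crosswalk tables packaged with `delphi_utils`.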
diff --git a/_delphi_utils_python/data_proc/geomap/geo_data_proc.py b/_delphi_utils_python/data_proc/geomap/geo_data_proc.py index 287667812..5634d6f83 100755 --- a/_delphi_utils_python/data_proc/geomap/geo_data_proc.py +++ b/_delphi_utils_python/data_proc/geomap/geo_data_proc.py @@ -1,10 +1,7 @@ """ -Authors: Dmitry Shemetov @dshemetov, James Sharpnack @jsharpna - -Intended execution: +Authors: Dmitry Shemetov, James Sharpnack cd _delphi_utils/data_proc/geomap -chmod u+x geo_data_proc.py python geo_data_proc.py """ @@ -12,7 +9,6 @@ from os import remove, listdir from os.path import join, isfile from zipfile import ZipFile -from pandas.core.frame import DataFrame import requests import pandas as pd @@ -20,7 +16,7 @@ # Source files -YEAR = 2019 +YEAR = 2020 INPUT_DIR = "./old_source_files" OUTPUT_DIR = f"../../delphi_utils/data/{YEAR}" FIPS_BY_ZIP_POP_URL = "https://www2.census.gov/geo/docs/maps-data/data/rel/zcta_county_rel_10.txt?#" @@ -42,7 +38,6 @@ FIPS_HHS_FILENAME = "fips_hhs_table.csv" FIPS_CHNGFIPS_OUT_FILENAME = "fips_chng-fips_table.csv" FIPS_POPULATION_OUT_FILENAME = "fips_pop.csv" - CHNGFIPS_STATE_OUT_FILENAME = "chng-fips_state_table.csv" ZIP_HSA_OUT_FILENAME = "zip_hsa_table.csv" ZIP_HRR_OUT_FILENAME = "zip_hrr_table.csv" @@ -70,8 +65,8 @@ def create_fips_zip_crosswalk(): # Find the population fractions (the heaviest computation, takes about a minute) # Note that the denominator in the fractions is the source population pop_df.set_index(["fips", "zip"], inplace=True) - fips_zip: DataFrame = pop_df.groupby("fips", as_index=False).apply(lambda g: g["pop"] / g["pop"].sum()) - zip_fips: DataFrame = pop_df.groupby("zip", as_index=False).apply(lambda g: g["pop"] / g["pop"].sum()) + fips_zip: pd.DataFrame = pop_df.groupby("fips", as_index=False).apply(lambda g: g["pop"] / g["pop"].sum()) + zip_fips: pd.DataFrame = pop_df.groupby("zip", as_index=False).apply(lambda g: g["pop"] / g["pop"].sum()) # Rename and write to file fips_zip = fips_zip.reset_index(level=["fips", "zip"]).rename(columns={"pop": "weight"}).query("weight > 0.0") @@ -228,7 +223,7 @@ def create_state_population_table(): derive_fips_state_crosswalk() census_pop = pd.read_csv(join(OUTPUT_DIR, FIPS_POPULATION_OUT_FILENAME), dtype={"fips": str, "pop": int}) - state: DataFrame = pd.read_csv(join(OUTPUT_DIR, FIPS_STATE_OUT_FILENAME), dtype=str) + state: pd.DataFrame = pd.read_csv(join(OUTPUT_DIR, FIPS_STATE_OUT_FILENAME), dtype=str) state_pop = state.merge(census_pop, on="fips").groupby(["state_code", "state_id", "state_name"], as_index=False).sum() state_pop.sort_values("state_code").to_csv(join(OUTPUT_DIR, STATE_POPULATION_OUT_FILENAME), index=False) diff --git a/_delphi_utils_python/delphi_utils/__init__.py b/_delphi_utils_python/delphi_utils/__init__.py index 2a9893a5e..7ff828440 100644 --- a/_delphi_utils_python/delphi_utils/__init__.py +++ b/_delphi_utils_python/delphi_utils/__init__.py @@ -15,4 +15,4 @@ from .nancodes import Nans from .weekday import Weekday -__version__ = "0.3.23" +__version__ = "0.3.24" diff --git a/_delphi_utils_python/delphi_utils/geomap.py b/_delphi_utils_python/delphi_utils/geomap.py index f43b80504..be6df2d24 100644 --- a/_delphi_utils_python/delphi_utils/geomap.py +++ b/_delphi_utils_python/delphi_utils/geomap.py @@ -1,74 +1,106 @@ """Contains geographic mapping tools. 
Authors: Dmitry Shemetov @dshemetov, James Sharpnack @jsharpna, Maria Jahja -Created: 2020-06-01 - -TODO: -- use a caching utility to store the crossfiles - see: https://github.com/cmu-delphi/covidcast-indicators/issues/282 """ -# pylint: disable=too-many-lines -from os.path import join + from collections import defaultdict +from os.path import join +from typing import Iterator, List, Literal, Optional, Set, Union +import importlib_resources import pandas as pd -import pkg_resources from pandas.api.types import is_string_dtype -class GeoMapper: # pylint: disable=too-many-public-methods +class GeoMapper: """Geo mapping tools commonly used in Delphi. The GeoMapper class provides utility functions for translating between different geocodes. Supported geocodes: - - zip: zip5, a length 5 str of 0-9 with leading 0's - - fips: state code and county code, a length 5 str of 0-9 with leading 0's - - msa: metropolitan statistical area, a length 5 str of 0-9 with leading 0's - - state_code: state code, a str of 0-9 - - state_id: state id, a str of A-Z - - hrr: hospital referral region, an int 1-500 - - Mappings: - - [x] zip -> fips : population weighted - - [x] zip -> hrr : unweighted - - [x] zip -> msa : unweighted - - [x] zip -> state - - [x] zip -> hhs - - [x] zip -> population - - [x] state code -> hhs - - [x] fips -> state : unweighted - - [x] fips -> msa : unweighted - - [x] fips -> megacounty - - [x] fips -> hrr - - [x] fips -> hhs - - [x] fips -> chng-fips - - [x] chng-fips -> state : unweighted - - [x] nation - - [ ] zip -> dma (postponed) - - The GeoMapper instance loads crosswalk tables from the package data_dir. The - crosswalk tables are assumed to have been built using the geo_data_proc.py script - in data_proc/geomap. If a mapping between codes is NOT one to many, then the table has - just two colums. If the mapping IS one to many, then a third column, the weight column, - exists (e.g. zip, fips, weight; satisfying (sum(weights) where zip==ZIP) == 1). + + - zip: five characters [0-9] with leading 0's, e.g. "33626" + also known as zip5 or zip code + - fips: five characters [0-9] with leading 0's, e.g. "12057" + the first two digits are the state FIPS code and the last + three are the county FIPS code + - msa: five characters [0-9] with leading 0's, e.g. "90001" + also known as metropolitan statistical area + - state_code: two characters [0-9], e.g "06" + - state_id: two characters [A-Z], e.g "CA" + - state_name: human-readable name, e.g "California" + - state_*: we use this below to refer to the three above geocodes in aggregate + - hrr: an integer from 1-500, also known as hospital + referral region + - hhs: an integer from 1-10, also known as health and human services region + https://www.hhs.gov/about/agencies/iea/regional-offices/index.html + + Valid mappings: + + From To Population Weighted + zip fips Yes + zip hrr No + zip msa Yes + zip state_* Yes + zip hhs Yes + zip population -- + zip nation No + state_* state_* No + state_* hhs No + state_* population -- + state_* nation No + fips state_* No + fips msa No + fips megacounty No + fips hrr Yes + fips hhs No + fips chng-fips No + fips nation No + chng-fips state_* No + + Crosswalk Tables + ================ + + The GeoMapper instance loads pre-generated crosswalk tables (built by the + script in `data_proc/geomap/geo_data_proc.py`). If a mapping between codes + is one to one or many to one, then the table has just two columns. 
If the + mapping is one to many, then a weight column is provided, which gives the + fractional population contribution of a source_geo to the target_geo. The + weights satisfy the condition that df.groupby(from_code).sum(weight) == 1.0 + for all values of from_code. + + Aggregation + =========== + + The GeoMapper class provides functions to aggregate data from one geocode + to another. The aggregation can be a simple one-to-one mapping or a + weighted aggregation. The weighted aggregation is useful when the data + being aggregated is a population-weighted quantity, such as visits or + cases. The aggregation is done by multiplying the data columns by the + weights and summing over the data columns. Note that the aggregation does + not adjust the aggregation for missing or NA values in the data columns, + which is equivalent to a zero-fill. Example Usage - ========== + ============= The main GeoMapper object loads and stores crosswalk dataframes on-demand. - When replacing geocodes with a new one an aggregation step is performed on the data columns - to merge entries (i.e. in the case of a many to one mapping or a weighted mapping). This - requires a specification of the data columns, which are assumed to be all the columns that - are not the geocodes or the date column specified in date_col. + When replacing geocodes with a new one an aggregation step is performed on + the data columns to merge entries (i.e. in the case of a many to one + mapping or a weighted mapping). This requires a specification of the data + columns, which are assumed to be all the columns that are not the geocodes + or the date column specified in date_col. Example 1: to add a new column with a new geocode, possibly with weights: > gmpr = GeoMapper() - > df = gmpr.add_geocode(df, "fips", "zip", from_col="fips", new_col="geo_id", + > df = gmpr.add_geocode(df, "fips", "zip", + from_col="fips", new_col="geo_id", date_col="timestamp", dropna=False) - Example 2: to replace a geocode column with a new one, aggregating the data with weights: + Example 2: to replace a geocode column with a new one, aggregating the data + with weights: > gmpr = GeoMapper() - > df = gmpr.replace_geocode(df, "fips", "zip", from_col="fips", new_col="geo_id", + > df = gmpr.replace_geocode(df, "fips", "zip", + from_col="fips", new_col="geo_id", date_col="timestamp", dropna=False) """ @@ -79,7 +111,7 @@ class GeoMapper: # pylint: disable=too-many-public-methods "msa": "zip_msa_table.csv", "pop": "zip_pop.csv", "state": "zip_state_code_table.csv", - "hhs": "zip_hhs_table.csv" + "hhs": "zip_hhs_table.csv", }, "fips": { "chng-fips": "fips_chng-fips_table.csv", @@ -90,23 +122,16 @@ class GeoMapper: # pylint: disable=too-many-public-methods "state": "fips_state_table.csv", "hhs": "fips_hhs_table.csv", }, + "hhs": {"pop": "hhs_pop.csv"}, "chng-fips": {"state": "chng-fips_state_table.csv"}, "state": {"state": "state_codes_table.csv"}, - "state_code": { - "hhs": "state_code_hhs_table.csv", - "pop": "state_pop.csv" - }, - "state_id": { - "pop": "state_pop.csv" - }, - "state_name": { - "pop": "state_pop.csv" - }, - "hhs": {"pop": "hhs_pop.csv"}, + "state_code": {"hhs": "state_code_hhs_table.csv", "pop": "state_pop.csv"}, + "state_id": {"pop": "state_pop.csv"}, + "state_name": {"pop": "state_pop.csv"}, "nation": {"pop": "nation_pop.csv"}, } - def __init__(self, census_year=2020): + def __init__(self, census_year: int = 2020): """Initialize geomapper. 
Parameters @@ -120,37 +145,35 @@ def __init__(self, census_year=2020): # Include all unique geos from first-level and second-level keys in # CROSSWALK_FILENAMES, with a few exceptions self._geos = { - subkey for mainkey in self.CROSSWALK_FILENAMES - for subkey in self.CROSSWALK_FILENAMES[mainkey] - }.union( - set(self.CROSSWALK_FILENAMES.keys()) - ) - set(["state", "pop"]) + subkey + for mainkey in self.CROSSWALK_FILENAMES + for subkey in self.CROSSWALK_FILENAMES[mainkey] + }.union(set(self.CROSSWALK_FILENAMES.keys())) - {"state", "pop"} for from_code, to_codes in self.CROSSWALK_FILENAMES.items(): for to_code, file_path in to_codes.items(): - self._crosswalks[from_code][to_code] = \ - self._load_crosswalk_from_file(from_code, - to_code, - join(f"data/{census_year}", file_path) - ) + self._crosswalks[from_code][to_code] = self._load_crosswalk_from_file( + from_code, to_code, join("data", f"{census_year}", file_path) + ) for geo_type in self._geos: self._geo_sets[geo_type] = self._load_geo_values(geo_type) - def _load_crosswalk_from_file(self, from_code, to_code, data_path): - stream = pkg_resources.resource_stream(__name__, data_path) + def _load_crosswalk_from_file( + self, from_code: str, to_code: str, data_path: str + ) -> pd.DataFrame: + stream = importlib_resources.files(__name__) / data_path dtype = { from_code: str, to_code: str, "pop": int, "weight": float, - **{geo: str for geo in self._geos - set("nation")} + **{geo: str for geo in self._geos - set("nation")}, } - usecols = [from_code, "pop"] if to_code == "pop" else None return pd.read_csv(stream, dtype=dtype, usecols=usecols) - def _load_geo_values(self, geo_type): + def _load_geo_values(self, geo_type: str) -> Set[str]: if geo_type == "nation": return {"us"} @@ -167,7 +190,9 @@ def _load_geo_values(self, geo_type): return set(crosswalk[geo_type]) @staticmethod - def convert_fips_to_mega(data, fips_col="fips", mega_col="megafips"): + def convert_fips_to_mega( + data: pd.DataFrame, fips_col: str = "fips", mega_col: str = "megafips" + ) -> pd.DataFrame: """Convert fips or chng-fips string to a megafips string.""" data = data.copy() data[mega_col] = data[fips_col].astype(str).str.zfill(5) @@ -176,14 +201,14 @@ def convert_fips_to_mega(data, fips_col="fips", mega_col="megafips"): @staticmethod def megacounty_creation( - data, - thr_count, - thr_win_len, - thr_col="visits", - fips_col="fips", - date_col="timestamp", - mega_col="megafips", - ): + data: pd.DataFrame, + thr_count: Union[float, int], + thr_win_len: int, + thr_col: str = "visits", + fips_col: str = "fips", + date_col: str = "timestamp", + mega_col: str = "megafips", + ) -> pd.DataFrame: """Create megacounty column. Parameters @@ -205,7 +230,7 @@ def megacounty_creation( if "_thr_col_roll" in data.columns: raise ValueError("Column name '_thr_col_roll' is reserved.") - def agg_sum_iter(data): + def agg_sum_iter(data: pd.DataFrame) -> Iterator[pd.DataFrame]: data_gby = ( data[[fips_col, date_col, thr_col]] .set_index(date_col) @@ -228,16 +253,17 @@ def agg_sum_iter(data): # Conversion functions def add_geocode( - self, df, from_code, new_code, from_col=None, new_col=None, dropna=True + self, + df: pd.DataFrame, + from_code: str, + new_code: str, + from_col: Optional[str] = None, + new_col: Optional[str] = None, + dropna: bool = True, ): """Add a new geocode column to a dataframe. 
- Currently supported conversions: - - fips -> state_code, state_id, state_name, zip, msa, hrr, nation, hhs, chng-fips - - chng-fips -> state_code, state_id, state_name - - zip -> state_code, state_id, state_name, fips, msa, hrr, nation, hhs - - state_x -> state_y (where x and y are in {code, id, name}), nation - - state_code -> hhs, nation + See class docstring for supported geocode transformations. Parameters --------- @@ -269,8 +295,9 @@ def add_geocode( df = df.copy() from_col = from_code if from_col is None else from_col new_col = new_code if new_col is None else new_col - assert from_col != new_col, \ - f"Can't use the same column '{from_col}' for both from_col and to_col" + assert ( + from_col != new_col + ), f"Can't use the same column '{from_col}' for both from_col and to_col" state_codes = ["state_code", "state_id", "state_name"] if not is_string_dtype(df[from_col]): @@ -305,7 +332,7 @@ def add_geocode( df = df.merge(crosswalk, left_on=from_col, right_on=from_col, how="left") # Drop extra state columns - if new_code in state_codes and not from_code in state_codes: + if new_code in state_codes and from_code not in state_codes: state_codes.remove(new_code) df.drop(columns=state_codes, inplace=True) elif new_code in state_codes and from_code in state_codes: @@ -316,7 +343,9 @@ def add_geocode( return df - def _add_nation_geocode(self, df, from_code, from_col, new_col): + def _add_nation_geocode( + self, df: pd.DataFrame, from_code: str, from_col: str, new_col: str + ) -> pd.DataFrame: """Add a nation geocode column to a dataframe. See `add_geocode()` documentation for argument description. @@ -328,29 +357,24 @@ def _add_nation_geocode(self, df, from_code, from_col, new_col): return df raise ValueError( - f"Conversion to the nation level is not supported " + "Conversion to the nation level is not supported " f"from {from_code}; try {valid_from_codes}" ) def replace_geocode( self, - df, - from_code, - new_code, - from_col=None, - new_col=None, - date_col="timestamp", - data_cols=None, - dropna=True, - ): + df: pd.DataFrame, + from_code: str, + new_code: str, + from_col: Optional[str] = None, + new_col: Optional[str] = None, + date_col: Optional[str] = "timestamp", + data_cols: Optional[List[str]] = None, + dropna: bool = True, + ) -> pd.DataFrame: """Replace a geocode column in a dataframe. - Currently supported conversions: - - fips -> chng-fips, state_code, state_id, state_name, zip, msa, hrr, nation - - chng-fips -> state_code, state_id, state_name - - zip -> state_code, state_id, state_name, fips, msa, hrr, nation - - state_x -> state_y (where x and y are in {code, id, name}), nation - - state_code -> hhs, nation + See class docstring for supported geocode transformations. Parameters --------- @@ -397,13 +421,19 @@ def replace_geocode( df[data_cols] = df[data_cols].multiply(df["weight"], axis=0) df.drop("weight", axis=1, inplace=True) - if not date_col is None: + if date_col is not None: df = df.groupby([date_col, new_col]).sum(numeric_only=True).reset_index() else: df = df.groupby([new_col]).sum(numeric_only=True).reset_index() return df - def add_population_column(self, data, geocode_type, geocode_col=None, dropna=True): + def add_population_column( + self, + data: pd.DataFrame, + geocode_type: Literal["fips", "zip"], + geocode_col: Optional[str] = None, + dropna: bool = True, + ) -> pd.DataFrame: """ Append a population column to a dataframe, based on the FIPS or ZIP code. 
@@ -428,7 +458,15 @@ def add_population_column(self, data, geocode_type, geocode_col=None, dropna=Tru """ geocode_col = geocode_type if geocode_col is None else geocode_col data = data.copy() - supported_geos = ["fips", "zip", "state_id", "state_name", "state_code", "hhs", "nation"] + supported_geos = [ + "fips", + "zip", + "state_id", + "state_name", + "state_code", + "hhs", + "nation", + ] if geocode_type not in supported_geos: raise ValueError( f"Only {supported_geos} geocodes supported. For other codes, aggregate those." @@ -442,24 +480,22 @@ def add_population_column(self, data, geocode_type, geocode_col=None, dropna=Tru else: data[geocode_col] = data[geocode_col].astype(str) merge_type = "inner" if dropna else "left" - data_with_pop = ( - data - .merge(pop_df, left_on=geocode_col, right_on=geocode_type, how=merge_type) - .rename(columns={"pop": "population"}) - ) + data_with_pop = data.merge( + pop_df, left_on=geocode_col, right_on=geocode_type, how=merge_type + ).rename(columns={"pop": "population"}) return data_with_pop @staticmethod def fips_to_megacounty( - data, - thr_count, - thr_win_len, - thr_col="visits", - fips_col="fips", - date_col="timestamp", - mega_col="megafips", + data: pd.DataFrame, + thr_count: Union[float, int], + thr_win_len: int, + thr_col: str = "visits", + fips_col: str = "fips", + date_col: str = "timestamp", + mega_col: str = "megafips", count_cols=None, - ): + ) -> pd.DataFrame: """Convert and aggregate from FIPS or chng-fips to megaFIPS. Parameters @@ -501,7 +537,7 @@ def fips_to_megacounty( data = data.reset_index().groupby([date_col, mega_col]).sum(numeric_only=True) return data.reset_index() - def as_mapper_name(self, geo_type, state="state_id"): + def as_mapper_name(self, geo_type: str, state: str = "state_id") -> str: """ Return the mapper equivalent of a region type. @@ -513,7 +549,7 @@ def as_mapper_name(self, geo_type, state="state_id"): return "fips" return geo_type - def get_crosswalk(self, from_code, to_code): + def get_crosswalk(self, from_code: str, to_code: str) -> pd.DataFrame: """Return a dataframe mapping the given geocodes. Parameters @@ -530,9 +566,11 @@ def get_crosswalk(self, from_code, to_code): try: return self._crosswalks[from_code][to_code] except KeyError as e: - raise ValueError(f'Mapping from "{from_code}" to "{to_code}" not found.') from e + raise ValueError( + f'Mapping from "{from_code}" to "{to_code}" not found.' + ) from e - def get_geo_values(self, geo_type): + def get_geo_values(self, geo_type: str) -> Set[str]: """ Return a set of all values for a given geography type. @@ -551,13 +589,17 @@ def get_geo_values(self, geo_type): except KeyError as e: raise ValueError(f'Given geo type "{geo_type}" not found') from e - def get_geos_within(self, container_geocode, contained_geocode_type, container_geocode_type): + def get_geos_within( + self, + container_geocode: str, + contained_geocode_type: str, + container_geocode_type: str, + ) -> Set[str]: """ Return all contained regions of the given type within the given container geocode. Given container_geocode (e.g "ca" for California) of type container_geocode_type - (e.g "state"), return: - - all (contained_geocode_type)s within container_geocode + (e.g "state"), return all (contained_geocode_type)s within container_geocode. 
Supports these 4 combinations: - all states within a nation @@ -581,20 +623,82 @@ def get_geos_within(self, container_geocode, contained_geocode_type, container_g if contained_geocode_type == "state": if container_geocode_type == "nation" and container_geocode == "us": crosswalk = self._crosswalks["state"]["state"] - return set(crosswalk["state_id"]) # pylint: disable=unsubscriptable-object + return set(crosswalk["state_id"]) if container_geocode_type == "hhs": crosswalk_hhs = self._crosswalks["fips"]["hhs"] crosswalk_state = self._crosswalks["fips"]["state"] - fips_hhs = crosswalk_hhs[crosswalk_hhs["hhs"] == container_geocode]["fips"] - return set(crosswalk_state[crosswalk_state["fips"].isin(fips_hhs)]["state_id"]) - elif (contained_geocode_type in ("county", "fips", "chng-fips") and - container_geocode_type == "state"): + fips_hhs = crosswalk_hhs[crosswalk_hhs["hhs"] == container_geocode][ + "fips" + ] + return set( + crosswalk_state[crosswalk_state["fips"].isin(fips_hhs)]["state_id"] + ) + elif ( + contained_geocode_type in ("county", "fips", "chng-fips") + and container_geocode_type == "state" + ): contained_geocode_type = self.as_mapper_name(contained_geocode_type) crosswalk = self._crosswalks[contained_geocode_type]["state"] return set( - crosswalk[crosswalk["state_id"] == container_geocode][contained_geocode_type] + crosswalk[crosswalk["state_id"] == container_geocode][ + contained_geocode_type + ] ) - raise ValueError("(contained_geocode_type, container_geocode_type) was " - f"({contained_geocode_type}, {container_geocode_type}), but " - "must be one of (state, nation), (state, hhs), (county, state)" - ", (fips, state), (chng-fips, state)") + raise ValueError( + "(contained_geocode_type, container_geocode_type) was " + f"({contained_geocode_type}, {container_geocode_type}), but " + "must be one of (state, nation), (state, hhs), (county, state)" + ", (fips, state), (chng-fips, state)" + ) + + def aggregate_by_weighted_sum( + self, df: pd.DataFrame, to_geo: str, sensor_col: str, time_col: str, population_col: str + ) -> pd.DataFrame: + """Aggregate sensor, weighted by time-dependent population. + + Note: This function generates its own population weights and excludes + locations where the data is NA, which is effectively an extrapolation + assumption to the rest of the geos. This is in contrast to the + `replace_geocode` function, which assumes that the weights are already + present in the data and does not adjust for missing data (see the + docstring for the GeoMapper class). + + Parameters + --------- + df: pd.DataFrame + Input dataframe, assumed to have a sensor column (e.g. "visits"), a + to_geo column (e.g. "state"), and a population column (corresponding + to a from_geo, e.g. "wastewater collection site"). + to_geo: str + The column name of the geocode to aggregate to. + sensor_col: str + The column name of the sensor to aggregate. + time_col: str + The column name of the timestamp column to aggregate over. + population_col: str + The column name of the population to weight the sensor by. + + Returns + --------- + agg_df: pd.DataFrame + A dataframe with the aggregated sensor values, weighted by population.
+ """ + # Don't modify the input dataframe + df = df.copy() + # Zero-out populations where the sensor is NA + df["_zeroed_pop"] = df[population_col] * df[sensor_col].abs().notna() + # Weight the sensor by the population + df["_weighted_sensor"] = df[sensor_col] * df["_zeroed_pop"] + agg_df = ( + df.groupby([time_col, to_geo]) + .agg( + { + "_zeroed_pop": "sum", + "_weighted_sensor": lambda x: x.sum(min_count=1), + } + ).assign( + _new_sensor = lambda x: x["_weighted_sensor"] / x["_zeroed_pop"] + ).reset_index() + .rename(columns={"_new_sensor": f"weighted_{sensor_col}"}) + .drop(columns=["_zeroed_pop", "_weighted_sensor"]) + ) + + return agg_df diff --git a/_delphi_utils_python/delphi_utils/logger.py b/_delphi_utils_python/delphi_utils/logger.py index d04ff7673..d70ae4c8e 100644 --- a/_delphi_utils_python/delphi_utils/logger.py +++ b/_delphi_utils_python/delphi_utils/logger.py @@ -1,9 +1,10 @@ -"""Structured logger utility for creating JSON logs.""" +"""Structured logger utility for creating JSON logs. -# the Delphi group uses two ~identical versions of this file. -# try to keep them in sync with edits, for sanity. -# https://github.com/cmu-delphi/covidcast-indicators/blob/main/_delphi_utils_python/delphi_utils/logger.py # pylint: disable=line-too-long -# https://github.com/cmu-delphi/delphi-epidata/blob/dev/src/common/logger.py +The Delphi group uses two ~identical versions of this file. +Try to keep them in sync with edits, for sanity. + https://github.com/cmu-delphi/covidcast-indicators/blob/main/_delphi_utils_python/delphi_utils/logger.py + https://github.com/cmu-delphi/delphi-epidata/blob/dev/src/common/logger.py +""" import contextlib import logging diff --git a/_delphi_utils_python/delphi_utils/smooth.py b/_delphi_utils_python/delphi_utils/smooth.py index 503fcf1b2..d9c95b552 100644 --- a/_delphi_utils_python/delphi_utils/smooth.py +++ b/_delphi_utils_python/delphi_utils/smooth.py @@ -304,17 +304,11 @@ def left_gauss_linear_smoother(self, signal): n = len(signal) signal_smoothed = np.zeros_like(signal) # A is the regression design matrix - A = np.vstack([np.ones(n), np.arange(n)]).T # pylint: disable=invalid-name + A = np.vstack([np.ones(n), np.arange(n)]).T for idx in range(n): - weights = np.exp( - -((np.arange(idx + 1) - idx) ** 2) / self.gaussian_bandwidth - ) - AwA = np.dot( # pylint: disable=invalid-name - A[: (idx + 1), :].T * weights, A[: (idx + 1), :] - ) - Awy = np.dot( # pylint: disable=invalid-name - A[: (idx + 1), :].T * weights, signal[: (idx + 1)].reshape(-1, 1) - ) + weights = np.exp(-((np.arange(idx + 1) - idx) ** 2) / self.gaussian_bandwidth) + AwA = np.dot(A[: (idx + 1), :].T * weights, A[: (idx + 1), :]) + Awy = np.dot(A[: (idx + 1), :].T * weights, signal[: (idx + 1)].reshape(-1, 1)) try: beta = np.linalg.solve(AwA, Awy) signal_smoothed[idx] = np.dot(A[: (idx + 1), :], beta)[-1] @@ -389,9 +383,7 @@ def savgol_coeffs(self, nl, nr, poly_fit_degree): if nr > 0: warnings.warn("The filter is no longer causal.") - A = np.vstack( # pylint: disable=invalid-name - [np.arange(nl, nr + 1) ** j for j in range(poly_fit_degree + 1)] - ).T + A = np.vstack([np.arange(nl, nr + 1) ** j for j in range(poly_fit_degree + 1)]).T if self.gaussian_bandwidth is None: mat_inverse = np.linalg.inv(A.T @ A) @ A.T @@ -406,7 +398,7 @@ def savgol_coeffs(self, nl, nr, poly_fit_degree): coeffs[i] = (mat_inverse @ basis_vector)[0] return coeffs - def savgol_smoother(self, signal): # pylint: disable=inconsistent-return-statements + def 
savgol_smoother(self, signal): """Smooth signal with the savgol smoother. Returns a convolution of the 1D signal with the Savitzky-Golay coefficients, respecting diff --git a/_delphi_utils_python/delphi_utils/validator/dynamic.py b/_delphi_utils_python/delphi_utils/validator/dynamic.py index 6758086ab..9bc72ec1c 100644 --- a/_delphi_utils_python/delphi_utils/validator/dynamic.py +++ b/_delphi_utils_python/delphi_utils/validator/dynamic.py @@ -320,7 +320,6 @@ def create_dfs(self, geo_sig_df, api_df_or_error, checking_date, geo_type, signa # # These variables are interpolated into the call to `api_df_or_error.query()` # below but pylint doesn't recognize that. - # pylint: disable=unused-variable reference_start_date = recent_cutoff_date - self.params.max_check_lookbehind if signal_type in self.params.smoothed_signals: # Add an extra 7 days to the reference period. @@ -328,7 +327,6 @@ def create_dfs(self, geo_sig_df, api_df_or_error, checking_date, geo_type, signa timedelta(days=7) reference_end_date = recent_cutoff_date - timedelta(days=1) - # pylint: enable=unused-variable # Subset API data to relevant range of dates. reference_api_df = api_df_or_error.query( diff --git a/_delphi_utils_python/delphi_utils/weekday.py b/_delphi_utils_python/delphi_utils/weekday.py index ba5d75815..6e8f23786 100644 --- a/_delphi_utils_python/delphi_utils/weekday.py +++ b/_delphi_utils_python/delphi_utils/weekday.py @@ -12,12 +12,19 @@ class Weekday: """Class to handle weekday effects.""" @staticmethod - def get_params(data, denominator_col, numerator_cols, date_col, scales, logger): + def get_params(data, denominator_col, numerator_cols, date_col, scales, logger, solver_override=None): r"""Fit weekday correction for each col in numerator_cols. Return a matrix of parameters: the entire vector of betas, for each time series column in the data. + + solver: Historically used "ECOS" but due to numerical stability issues, "CLARABEL" + (introduced in cvxpy 1.3)is now the default solver in cvxpy 1.5. """ + if solver_override is None: + solver = cp.CLARABEL + else: + solver = solver_override tmp = data.reset_index() denoms = tmp.groupby(date_col).sum()[denominator_col] nums = tmp.groupby(date_col).sum()[numerator_cols] @@ -35,7 +42,7 @@ def get_params(data, denominator_col, numerator_cols, date_col, scales, logger): # Loop over the available numerator columns and smooth each separately. for i in range(nums.shape[1]): - result = Weekday._fit(X, scales, npnums[:, i], npdenoms) + result = Weekday._fit(X, scales, npnums[:, i], npdenoms, solver) if result is None: logger.error("Unable to calculate weekday correction") else: @@ -44,7 +51,18 @@ def get_params(data, denominator_col, numerator_cols, date_col, scales, logger): return params @staticmethod - def _fit(X, scales, npnums, npdenoms): + def get_params_legacy(data, denominator_col, numerator_cols, date_col, scales, logger): + r""" + Preserves older default behavior of using the ECOS solver. + + NOTE: "ECOS" solver will not be installed by default as of cvxpy 1.6 + """ + return Weekday.get_params( + data, denominator_col, numerator_cols, date_col, scales, logger, solver_override=cp.ECOS + ) + + @staticmethod + def _fit(X, scales, npnums, npdenoms, solver): r"""Correct a signal estimated as numerator/denominator for weekday effects. 
The ordinary estimate would be numerator_t/denominator_t for each time point @@ -78,6 +96,8 @@ def _fit(X, scales, npnums, npdenoms): ll = (numerator * (X*b + log(denominator)) - sum(exp(X*b) + log(denominator))) / num_days + + solver: Historically use "ECOS" but due to numerical issues, "CLARABEL" is now default. """ b = cp.Variable((X.shape[1])) @@ -93,7 +113,7 @@ def _fit(X, scales, npnums, npdenoms): for scale in scales: try: prob = cp.Problem(cp.Minimize((-ll + lmbda * penalty) / scale)) - _ = prob.solve() + _ = prob.solve(solver=solver) return b.value except SolverError: # If the magnitude of the objective function is too large, an error is diff --git a/_delphi_utils_python/setup.py b/_delphi_utils_python/setup.py index 046dc5d3a..3dee89b53 100644 --- a/_delphi_utils_python/setup.py +++ b/_delphi_utils_python/setup.py @@ -2,23 +2,26 @@ from setuptools import find_packages with open("README.md", "r") as f: - long_description = f.read() + long_description = f.read() required = [ "boto3", "covidcast", "cvxpy", + "scs<3.2.6", # TODO: remove this ; it is a cvxpy dependency, and the excluded version appears to break our jenkins build. see: https://github.com/cvxgrp/scs/issues/283 + "darker[isort]~=2.1.1", "epiweeks", "freezegun", "gitpython", + "importlib_resources>=1.3", "mock", "moto~=4.2.14", "numpy", "pandas>=1.1.0", "pydocstyle", "pylint==2.8.3", - "pytest", "pytest-cov", + "pytest", "requests-mock", "slackclient", "structlog", @@ -27,7 +30,7 @@ setup( name="delphi_utils", - version="0.3.23", + version="0.3.24", description="Shared Utility Functions for Indicators", long_description=long_description, long_description_content_type="text/markdown", diff --git a/_delphi_utils_python/tests/test_archive.py b/_delphi_utils_python/tests/test_archive.py index e821e011b..589b55513 100644 --- a/_delphi_utils_python/tests/test_archive.py +++ b/_delphi_utils_python/tests/test_archive.py @@ -2,10 +2,11 @@ from io import StringIO, BytesIO from os import listdir, mkdir from os.path import join -from typing import Any, Dict, List +from typing import Dict, List from boto3 import Session -from git import Repo, exc +from git import Repo +from git.exc import InvalidGitRepositoryError import mock from moto import mock_s3 import numpy as np @@ -16,6 +17,7 @@ from delphi_utils.archive import ArchiveDiffer, GitArchiveDiffer, S3ArchiveDiffer,\ archiver_from_params from delphi_utils.nancodes import Nans +from testing import set_df_dtypes CSV_DTYPES = { "geo_id": str, "val": float, "se": float, "sample_size": float, @@ -26,20 +28,12 @@ class Example: def __init__(self, before, after, diff): def fix_df(df): if isinstance(df, pd.DataFrame): - return Example._set_df_datatypes(df, CSV_DTYPES) + return set_df_dtypes(df, CSV_DTYPES) return df self.before = fix_df(before) self.after = fix_df(after) self.diff = fix_df(diff) - @staticmethod - def _set_df_datatypes(df: pd.DataFrame, dtypes: Dict[str, Any]) -> pd.DataFrame: - df = df.copy() - for k, v in dtypes.items(): - if k in df.columns: - df[k] = df[k].astype(v) - return df - @dataclass class Expecteds: deleted: List[str] @@ -194,9 +188,6 @@ def __post_init__(self): assert set(EXPECTEDS.new) == set(f"{csv_name}.csv" for csv_name, dfs in CSVS.items() if dfs.before is None), \ "Bad programmer: added more new files to CSVS.after without updating EXPECTEDS.new" -def _assert_frames_equal_ignore_row_order(df1, df2, index_cols: List[str] = None): - return assert_frame_equal(df1.set_index(index_cols).sort_index(), df2.set_index(index_cols).sort_index()) - class 
ArchiveDifferTestlike: def set_up(self, tmp_path): cache_dir = join(str(tmp_path), "cache") @@ -209,10 +200,10 @@ def check_filtered_exports(self, export_dir): assert set(listdir(export_dir)) == set(EXPECTEDS.filtered_exports) for f in EXPECTEDS.filtered_exports: example = CSVS[f.replace(".csv", "")] - _assert_frames_equal_ignore_row_order( - pd.read_csv(join(export_dir, f), dtype=CSV_DTYPES), - example.after if example.diff is None else example.diff, - index_cols=["geo_id"] + example = example.after if example.diff is None else example.diff + assert_frame_equal( + pd.read_csv(join(export_dir, f), dtype=CSV_DTYPES).sort_values("geo_id", ignore_index=True), + example.sort_values("geo_id", ignore_index=True) ) class TestArchiveDiffer(ArchiveDifferTestlike): @@ -264,14 +255,13 @@ def test_diff_and_filter_exports(self, tmp_path): # Check that the diff files look as expected for key, diff_name in EXPECTEDS.common_diffs.items(): - if diff_name is None: continue - _assert_frames_equal_ignore_row_order( - pd.read_csv(join(export_dir, diff_name), dtype=CSV_DTYPES), - CSVS[key.replace(".csv", "")].diff, - index_cols=["geo_id"] + if diff_name is None: + continue + assert_frame_equal( + pd.read_csv(join(export_dir, diff_name), dtype=CSV_DTYPES).sort_values("geo_id", ignore_index=True), + CSVS[key.replace(".csv", "")].diff.sort_values("geo_id", ignore_index=True) ) - # Test filter_exports # =================== @@ -406,7 +396,7 @@ def test_init_args(self, tmp_path): GitArchiveDiffer(cache_dir, export_dir, override_dirty=False, commit_partial_success=True) - with pytest.raises(exc.InvalidGitRepositoryError): + with pytest.raises(InvalidGitRepositoryError): GitArchiveDiffer(cache_dir, export_dir) repo = Repo.init(cache_dir) diff --git a/_delphi_utils_python/tests/test_geomap.py b/_delphi_utils_python/tests/test_geomap.py index ab86c143d..c968fd359 100644 --- a/_delphi_utils_python/tests/test_geomap.py +++ b/_delphi_utils_python/tests/test_geomap.py @@ -10,10 +10,12 @@ def geomapper(): return GeoMapper(census_year=2020) + @pytest.fixture(scope="class") def geomapper_2019(): return GeoMapper(census_year=2019) + class TestGeoMapper: fips_data = pd.DataFrame( { @@ -34,7 +36,8 @@ class TestGeoMapper: fips_data_3 = pd.DataFrame( { "fips": ["48059", "48253", "48441", "72003", "72005", "10999"], - "timestamp": [pd.Timestamp("2018-01-01")] * 3 + [pd.Timestamp("2018-01-03")] * 3, + "timestamp": [pd.Timestamp("2018-01-01")] * 3 + + [pd.Timestamp("2018-01-03")] * 3, "count": [1, 2, 3, 4, 8, 5], "total": [2, 4, 7, 11, 100, 10], } @@ -58,7 +61,8 @@ class TestGeoMapper: zip_data = pd.DataFrame( { "zip": ["45140", "95616", "95618"] * 2, - "timestamp": [pd.Timestamp("2018-01-01")] * 3 + [pd.Timestamp("2018-01-03")] * 3, + "timestamp": [pd.Timestamp("2018-01-01")] * 3 + + [pd.Timestamp("2018-01-03")] * 3, "count": [99, 345, 456, 100, 344, 442], } ) @@ -132,7 +136,7 @@ class TestGeoMapper: ) # Loading tests updated 8/26 - def test_crosswalks(self, geomapper): + def test_crosswalks(self, geomapper: GeoMapper): # These tests ensure that the one-to-many crosswalks have properly normalized weights # FIPS -> HRR is allowed to be an incomplete mapping, since only a fraction of a FIPS # code can not belong to an HRR @@ -152,33 +156,32 @@ def test_crosswalks(self, geomapper): cw = geomapper.get_crosswalk(from_code="zip", to_code="hhs") assert cw.groupby("zip")["weight"].sum().round(5).eq(1.0).all() - - def test_load_zip_fips_table(self, geomapper): + def test_load_zip_fips_table(self, geomapper: GeoMapper): fips_data = 
geomapper.get_crosswalk(from_code="zip", to_code="fips") assert set(fips_data.columns) == set(["zip", "fips", "weight"]) assert pd.api.types.is_string_dtype(fips_data.zip) assert pd.api.types.is_string_dtype(fips_data.fips) assert pd.api.types.is_float_dtype(fips_data.weight) - def test_load_state_table(self, geomapper): + def test_load_state_table(self, geomapper: GeoMapper): state_data = geomapper.get_crosswalk(from_code="state", to_code="state") assert tuple(state_data.columns) == ("state_code", "state_id", "state_name") assert state_data.shape[0] == 60 - def test_load_fips_msa_table(self, geomapper): + def test_load_fips_msa_table(self, geomapper: GeoMapper): msa_data = geomapper.get_crosswalk(from_code="fips", to_code="msa") assert tuple(msa_data.columns) == ("fips", "msa") - def test_load_fips_chngfips_table(self, geomapper): + def test_load_fips_chngfips_table(self, geomapper: GeoMapper): chngfips_data = geomapper.get_crosswalk(from_code="fips", to_code="chng-fips") assert tuple(chngfips_data.columns) == ("fips", "chng-fips") - def test_load_zip_hrr_table(self, geomapper): + def test_load_zip_hrr_table(self, geomapper: GeoMapper): zip_data = geomapper.get_crosswalk(from_code="zip", to_code="hrr") assert pd.api.types.is_string_dtype(zip_data["zip"]) assert pd.api.types.is_string_dtype(zip_data["hrr"]) - def test_megacounty(self, geomapper): + def test_megacounty(self, geomapper: GeoMapper): new_data = geomapper.fips_to_megacounty(self.mega_data, 6, 50) assert ( new_data[["count", "visits"]].sum() @@ -204,12 +207,18 @@ def test_megacounty(self, geomapper): "count": [8, 7, 3, 10021], } ) - pd.testing.assert_frame_equal(new_data.set_index("megafips").sort_index(axis=1), expected_df.set_index("megafips").sort_index(axis=1)) + pd.testing.assert_frame_equal( + new_data.set_index("megafips").sort_index(axis=1), + expected_df.set_index("megafips").sort_index(axis=1), + ) # chng-fips should have the same behavior when converting to megacounties. mega_county_groups = self.mega_data_3.copy() - mega_county_groups.fips.replace({1125:"01g01"}, inplace = True) + mega_county_groups.fips.replace({1125: "01g01"}, inplace=True) new_data = geomapper.fips_to_megacounty(self.mega_data_3, 4, 1) - pd.testing.assert_frame_equal(new_data.set_index("megafips").sort_index(axis=1), expected_df.set_index("megafips").sort_index(axis=1)) + pd.testing.assert_frame_equal( + new_data.set_index("megafips").sort_index(axis=1), + expected_df.set_index("megafips").sort_index(axis=1), + ) new_data = geomapper.fips_to_megacounty(self.mega_data_3, 4, 1, thr_col="count") expected_df = pd.DataFrame( @@ -220,14 +229,20 @@ def test_megacounty(self, geomapper): "count": [6, 5, 7, 10021], } ) - pd.testing.assert_frame_equal(new_data.set_index("megafips").sort_index(axis=1), expected_df.set_index("megafips").sort_index(axis=1)) + pd.testing.assert_frame_equal( + new_data.set_index("megafips").sort_index(axis=1), + expected_df.set_index("megafips").sort_index(axis=1), + ) # chng-fips should have the same behavior when converting to megacounties. 
mega_county_groups = self.mega_data_3.copy() - mega_county_groups.fips.replace({1123:"01g01"}, inplace = True) + mega_county_groups.fips.replace({1123: "01g01"}, inplace=True) new_data = geomapper.fips_to_megacounty(self.mega_data_3, 4, 1, thr_col="count") - pd.testing.assert_frame_equal(new_data.set_index("megafips").sort_index(axis=1), expected_df.set_index("megafips").sort_index(axis=1)) + pd.testing.assert_frame_equal( + new_data.set_index("megafips").sort_index(axis=1), + expected_df.set_index("megafips").sort_index(axis=1), + ) - def test_add_population_column(self, geomapper): + def test_add_population_column(self, geomapper: GeoMapper): new_data = geomapper.add_population_column(self.fips_data_3, "fips") assert new_data.shape == (5, 5) new_data = geomapper.add_population_column(self.zip_data, "zip") @@ -245,14 +260,18 @@ def test_add_population_column(self, geomapper): new_data = geomapper.add_population_column(self.nation_data, "nation") assert new_data.shape == (1, 3) - def test_add_geocode(self, geomapper): + def test_add_geocode(self, geomapper: GeoMapper): # state_code -> nation new_data = geomapper.add_geocode(self.zip_data, "zip", "state_code") new_data2 = geomapper.add_geocode(new_data, "state_code", "nation") assert new_data2["nation"].unique()[0] == "us" new_data = geomapper.replace_geocode(self.zip_data, "zip", "state_code") - new_data2 = geomapper.add_geocode(new_data, "state_code", "state_id", new_col="state") - new_data3 = geomapper.replace_geocode(new_data2, "state_code", "nation", new_col="geo_id") + new_data2 = geomapper.add_geocode( + new_data, "state_code", "state_id", new_col="state" + ) + new_data3 = geomapper.replace_geocode( + new_data2, "state_code", "nation", new_col="geo_id" + ) assert "state" not in new_data3.columns # state_code -> hhs @@ -264,11 +283,15 @@ def test_add_geocode(self, geomapper): new_data = geomapper.replace_geocode(self.zip_data, "zip", "state_name") new_data2 = geomapper.add_geocode(new_data, "state_name", "state_id") assert new_data2.shape == (4, 5) - new_data2 = geomapper.replace_geocode(new_data, "state_name", "state_id", new_col="abbr") + new_data2 = geomapper.replace_geocode( + new_data, "state_name", "state_id", new_col="abbr" + ) assert "abbr" in new_data2.columns # fips -> nation - new_data = geomapper.replace_geocode(self.fips_data_5, "fips", "nation", new_col="NATION") + new_data = geomapper.replace_geocode( + self.fips_data_5, "fips", "nation", new_col="NATION" + ) pd.testing.assert_frame_equal( new_data, pd.DataFrame().from_dict( @@ -278,15 +301,25 @@ def test_add_geocode(self, geomapper): "count": {0: 10024.0}, "total": {0: 100006.0}, } - ) + ), ) # fips -> chng-fips new_data = geomapper.add_geocode(self.fips_data_5, "fips", "chng-fips") - assert sorted(list(new_data["chng-fips"])) == ['01123', '18181', '48g19', '72003'] + assert sorted(list(new_data["chng-fips"])) == [ + "01123", + "18181", + "48g19", + "72003", + ] assert new_data["chng-fips"].size == self.fips_data_5.fips.size new_data = geomapper.replace_geocode(self.fips_data_5, "fips", "chng-fips") - assert sorted(list(new_data["chng-fips"])) == ['01123', '18181', '48g19', '72003'] + assert sorted(list(new_data["chng-fips"])) == [ + "01123", + "18181", + "48g19", + "72003", + ] assert new_data["chng-fips"].size == self.fips_data_5.fips.size # chng-fips -> state_id @@ -294,12 +327,12 @@ def test_add_geocode(self, geomapper): new_data2 = geomapper.add_geocode(new_data, "chng-fips", "state_id") assert new_data2["state_id"].unique().size == 4 assert 
new_data2["state_id"].size == self.fips_data_5.fips.size - assert sorted(list(new_data2["state_id"])) == ['al', 'in', 'pr', 'tx'] + assert sorted(list(new_data2["state_id"])) == ["al", "in", "pr", "tx"] new_data2 = geomapper.replace_geocode(new_data, "chng-fips", "state_id") assert new_data2["state_id"].unique().size == 4 assert new_data2["state_id"].size == 4 - assert sorted(list(new_data2["state_id"])) == ['al', 'in', 'pr', 'tx'] + assert sorted(list(new_data2["state_id"])) == ["al", "in", "pr", "tx"] # zip -> nation new_data = geomapper.replace_geocode(self.zip_data, "zip", "nation") @@ -315,7 +348,7 @@ def test_add_geocode(self, geomapper): "count": {0: 900, 1: 886}, "total": {0: 1800, 1: 1772}, } - ) + ), ) # hrr -> nation @@ -324,53 +357,84 @@ def test_add_geocode(self, geomapper): new_data2 = geomapper.replace_geocode(new_data, "hrr", "nation") # fips -> hrr (dropna=True/False check) - assert not geomapper.add_geocode(self.fips_data_3, "fips", "hrr").isna().any().any() - assert geomapper.add_geocode(self.fips_data_3, "fips", "hrr", dropna=False).isna().any().any() + assert ( + not geomapper.add_geocode(self.fips_data_3, "fips", "hrr") + .isna() + .any() + .any() + ) + assert ( + geomapper.add_geocode(self.fips_data_3, "fips", "hrr", dropna=False) + .isna() + .any() + .any() + ) # fips -> zip (date_col=None chech) - new_data = geomapper.replace_geocode(self.fips_data_5.drop(columns=["timestamp"]), "fips", "hrr", date_col=None) + new_data = geomapper.replace_geocode( + self.fips_data_5.drop(columns=["timestamp"]), "fips", "hrr", date_col=None + ) pd.testing.assert_frame_equal( new_data, pd.DataFrame().from_dict( { - 'hrr': {0: '1', 1: '183', 2: '184', 3: '382', 4: '7'}, - 'count': {0: 1.772347174163783, 1: 7157.392403522299, 2: 2863.607596477701, 3: 1.0, 4: 0.22765282583621685}, - 'total': {0: 3.544694348327566, 1: 71424.64801363471, 2: 28576.35198636529, 3: 1.0, 4: 0.4553056516724337} + "hrr": {0: "1", 1: "183", 2: "184", 3: "382", 4: "7"}, + "count": { + 0: 1.772347174163783, + 1: 7157.392403522299, + 2: 2863.607596477701, + 3: 1.0, + 4: 0.22765282583621685, + }, + "total": { + 0: 3.544694348327566, + 1: 71424.64801363471, + 2: 28576.35198636529, + 3: 1.0, + 4: 0.4553056516724337, + }, } - ) + ), ) # fips -> hhs - new_data = geomapper.replace_geocode(self.fips_data_3.drop(columns=["timestamp"]), - "fips", "hhs", date_col=None) + new_data = geomapper.replace_geocode( + self.fips_data_3.drop(columns=["timestamp"]), "fips", "hhs", date_col=None + ) pd.testing.assert_frame_equal( new_data, pd.DataFrame().from_dict( { "hhs": {0: "2", 1: "6"}, "count": {0: 12, 1: 6}, - "total": {0: 111, 1: 13} + "total": {0: 111, 1: 13}, } - ) + ), ) # zip -> hhs new_data = geomapper.replace_geocode(self.zip_data, "zip", "hhs") - new_data = new_data.round(10) # get rid of a floating point error with 99.00000000000001 + new_data = new_data.round( + 10 + ) # get rid of a floating point error with 99.00000000000001 pd.testing.assert_frame_equal( new_data, pd.DataFrame().from_dict( { - "timestamp": {0: pd.Timestamp("2018-01-01"), 1: pd.Timestamp("2018-01-01"), - 2: pd.Timestamp("2018-01-03"), 3: pd.Timestamp("2018-01-03")}, + "timestamp": { + 0: pd.Timestamp("2018-01-01"), + 1: pd.Timestamp("2018-01-01"), + 2: pd.Timestamp("2018-01-03"), + 3: pd.Timestamp("2018-01-03"), + }, "hhs": {0: "5", 1: "9", 2: "5", 3: "9"}, "count": {0: 99.0, 1: 801.0, 2: 100.0, 3: 786.0}, - "total": {0: 198.0, 1: 1602.0, 2: 200.0, 3: 1572.0} + "total": {0: 198.0, 1: 1602.0, 2: 200.0, 3: 1572.0}, } - ) + ), ) - def test_get_geos(self, 
geomapper): + def test_get_geos(self, geomapper: GeoMapper): assert geomapper.get_geo_values("nation") == {"us"} assert geomapper.get_geo_values("hhs") == set(str(i) for i in range(1, 11)) assert len(geomapper.get_geo_values("fips")) == 3293 @@ -378,20 +442,114 @@ def test_get_geos(self, geomapper): assert len(geomapper.get_geo_values("state_id")) == 60 assert len(geomapper.get_geo_values("zip")) == 32976 - def test_get_geos_2019(self, geomapper_2019): + def test_get_geos_2019(self, geomapper_2019: GeoMapper): assert len(geomapper_2019.get_geo_values("fips")) == 3292 assert len(geomapper_2019.get_geo_values("chng-fips")) == 2710 - def test_get_geos_within(self, geomapper): - assert len(geomapper.get_geos_within("us","state","nation")) == 60 - assert len(geomapper.get_geos_within("al","county","state")) == 68 - assert len(geomapper.get_geos_within("al","fips","state")) == 68 - assert geomapper.get_geos_within("al","fips","state") == geomapper.get_geos_within("al","county","state") - assert len(geomapper.get_geos_within("al","chng-fips","state")) == 66 - assert len(geomapper.get_geos_within("4","state","hhs")) == 8 - assert geomapper.get_geos_within("4","state","hhs") == {'al', 'fl', 'ga', 'ky', 'ms', 'nc', "tn", "sc"} + def test_get_geos_within(self, geomapper: GeoMapper): + assert len(geomapper.get_geos_within("us", "state", "nation")) == 60 + assert len(geomapper.get_geos_within("al", "county", "state")) == 68 + assert len(geomapper.get_geos_within("al", "fips", "state")) == 68 + assert geomapper.get_geos_within( + "al", "fips", "state" + ) == geomapper.get_geos_within("al", "county", "state") + assert len(geomapper.get_geos_within("al", "chng-fips", "state")) == 66 + assert len(geomapper.get_geos_within("4", "state", "hhs")) == 8 + assert geomapper.get_geos_within("4", "state", "hhs") == { + "al", + "fl", + "ga", + "ky", + "ms", + "nc", + "tn", + "sc", + } - def test_census_year_pop(self, geomapper, geomapper_2019): + def test_census_year_pop(self, geomapper: GeoMapper, geomapper_2019: GeoMapper): df = pd.DataFrame({"fips": ["01001"]}) assert geomapper.add_population_column(df, "fips").population[0] == 56145 assert geomapper_2019.add_population_column(df, "fips").population[0] == 55869 + + def test_aggregate_by_weighted_sum(self, geomapper: GeoMapper): + df = pd.DataFrame( + { + "timestamp": [0] * 7, + "state": ["al", "al", "ca", "ca", "nd", "me", "me"], + "a": [1, 2, 3, 4, 12, -2, 2], + "b": [5, 6, 7, np.nan, np.nan, -1, -2], + "population_served": [10, 5, 8, 1, 3, 1, 2], + } + ) + agg_df = geomapper.aggregate_by_weighted_sum( + df, + to_geo="state", + sensor_col="a", + time_col="timestamp", + population_col="population_served", + ) + agg_df_by_hand = pd.DataFrame( + { + "timestamp": [0] * 4, + "state": ["al", "ca", "me", "nd"], + "weighted_a": [ + (1 * 10 + 2 * 5) / 15, + (3 * 8 + 4 * 1) / 9, + (-2 * 1 + 2 * 2) / 3, + (12 * 3) / 3, + ], + } + ) + pd.testing.assert_frame_equal(agg_df, agg_df_by_hand) + agg_df = geomapper.aggregate_by_weighted_sum( + df, + to_geo="state", + sensor_col="b", + time_col="timestamp", + population_col="population_served", + ) + agg_df_by_hand = pd.DataFrame( + { + "timestamp": [0] * 4, + "state": ["al", "ca", "me", "nd"], + "weighted_b": [ + (5 * 10 + 6 * 5) / 15, + (7 * 8 + 4 * 0) / 8, + (-1 * 1 + -2 * 2) / 3, + (np.nan) / 3, + ], + } + ) + pd.testing.assert_frame_equal(agg_df, agg_df_by_hand) + + df = pd.DataFrame( + { + "state": [ + "al", + "al", + "ca", + "ca", + "nd", + ], + "nation": ["us"] * 5, + "timestamp": [0] * 3 + [1] * 2, + "a": [1, 2, 3, 4, 12], 
+ "b": [5, 6, 7, np.nan, np.nan], + "population_served": [10, 5, 8, 1, 3], + } + ) + agg_df = geomapper.aggregate_by_weighted_sum( + df, + to_geo="nation", + sensor_col="a", + time_col="timestamp", + population_col="population_served", + ) + agg_df_by_hand = pd.DataFrame( + { + "timestamp": [0, 1], + "nation": ["us"] * 2, + "weighted_a": [(1 * 10 + 2 * 5 + 3 * 8) / 23, (1 * 4 + 3 * 12) / 4], + } + ) + pd.testing.assert_frame_equal(agg_df, agg_df_by_hand) diff --git a/_delphi_utils_python/tests/test_weekday.py b/_delphi_utils_python/tests/test_weekday.py index 52e6f4f7e..adb3fbfae 100644 --- a/_delphi_utils_python/tests/test_weekday.py +++ b/_delphi_utils_python/tests/test_weekday.py @@ -18,6 +18,18 @@ def test_get_params(self): result = Weekday.get_params(self.TEST_DATA, "den", ["num"], "date", [1], TEST_LOGGER) print(result) + expected_result = np.array([[-0.05990542, -0.07272124, -0.05618539, + 0.0343087, 0.1253007, 0.04562494, + -2.27662546, -1.8956484, -1.56959677, + -1.29847058, -1.08226981, -0.92099449, + -0.81464459, -0.76322013, -0.7667211,-0.8251475]]) + assert np.allclose(result, expected_result) + + def test_get_params_legacy(self): + TEST_LOGGER = logging.getLogger() + + result = Weekday.get_params_legacy(self.TEST_DATA, "den", ["num"], "date", [1], TEST_LOGGER) + print(result) expected_result = [ -0.05993665, -0.0727396, @@ -71,4 +83,4 @@ def test_calc_adjustment(self): # The date and "den" column are unchanged by this function assert np.allclose(result["num"].values, expected_nums) assert np.allclose(result["den"].values, self.TEST_DATA["den"].values) - assert np.array_equal(result["date"].values, self.TEST_DATA["date"].values) \ No newline at end of file + assert np.array_equal(result["date"].values, self.TEST_DATA["date"].values) diff --git a/_delphi_utils_python/tests/testing.py b/_delphi_utils_python/tests/testing.py new file mode 100644 index 000000000..7e8f55e90 --- /dev/null +++ b/_delphi_utils_python/tests/testing.py @@ -0,0 +1,23 @@ +"""Common utilities for testing functions.""" +from typing import Any, Dict +import pandas as pd + + +def check_valid_dtype(dtype): + """Check if a dtype is a valid Pandas type.""" + try: + pd.api.types.pandas_dtype(dtype) + except TypeError as e: + raise ValueError(f"Invalid dtype {dtype}") from e + + +def set_df_dtypes(df: pd.DataFrame, dtypes: Dict[str, Any]) -> pd.DataFrame: + """Set the dataframe column datatypes.""" + for d in dtypes.values(): + check_valid_dtype(d) + + df = df.copy() + for k, v in dtypes.items(): + if k in df.columns: + df[k] = df[k].astype(v) + return df diff --git a/_template_python/INDICATOR_DEV_GUIDE.md b/_template_python/INDICATOR_DEV_GUIDE.md new file mode 100644 index 000000000..d003b6696 --- /dev/null +++ b/_template_python/INDICATOR_DEV_GUIDE.md @@ -0,0 +1,448 @@ +# Pipeline Development Manual + + +## A step-by-step guide to writing a pipeline + +TODO: + +* Geomapper guide +* Setting up development environment +* Deployment guide +* Manual for R? + + +## Introduction + +This document provides a comprehensive guide on how to write a data pipeline in Python for the Delphi group. +It focuses on various aspects of building a pipeline, including ingestion, transformation, and storage. +This document assumes basic knowledge of Python and a familiarity with Delphi’s data processing practices. +Throughout the manual, we will use various python libraries to demonstrate how to build a data pipeline that can handle large volumes of data efficiently. 
+We will also discuss best practices for building reliable, scalable, and maintainable data pipelines. + +### Related documents: + +[Adding new API endpoints](https://cmu-delphi.github.io/delphi-epidata/new_endpoint_tutorial.html) (of which COVIDcast is a single example). + +Most new data sources will be added as indicators within the main endpoint (called COVIDcast as of 2024-06-28). +In rare cases, it may be preferable to add a dedicated endpoint for a new indicator. +This would mainly be done if the format of the new data weren't compatible with the format used by the main endpoint, for example, if an indicator reports the same signal for many demographic groups, or if the reported geographic levels are nonstandard in some way. + +[Setting up an S3 ArchiveDiffer](https://docs.google.com/document/d/1VcnvfeiO-GUUf88RosmNUfiPMoby-SnwH9s12esi4sI/edit#heading=h.e4ul15t3xmfj). Archive differs are used to compress data that has a long history that doesn't change that much. For example, the JHU CSSE indicator occasionally had revisions that could go back far in time, which meant that we needed to output all reference dates every day. Revisions didn't impact every location or reference date at a time, which meant that every issue would contain many values that were exactly the same as values issued the previous day. The archive differ removes those duplicates. + +[Indicator debugging guide](https://docs.google.com/document/d/1vaNgQ2cDrMvAg0FbSurbCemF9WqZVrirPpWEK0RdATQ/edit): somewhat out-of-date but might still be useful + + +## Basic steps of an indicator + +This is the general extract-transform-load procedure used by all COVIDcast indicators: + +1. Download data from the source. + * This could be via an [API query](https://github.com/cmu-delphi/covidcast-indicators/blob/fe39ebb1f8baa76670eb665d1dc99376ddfd3010/nssp/delphi_nssp/pull.py#L30), scraping a website, [an SFTP](https://github.com/cmu-delphi/covidcast-indicators/blob/fe39ebb1f8baa76670eb665d1dc99376ddfd3010/changehc/delphi_changehc/download_ftp_files.py#L19) or S3 dropbox, an email attachment, etc. +2. Process the source data to extract one or more time-series signals. + * A signal includes a value, standard deviation (data-dependent), and sample size (data-dependent) for each region for each unit of time (a day or an epidemiological week "epi-week"). +3. Aggregate each signal to all possible standard higher geographic levels. + * For example, we generate data at the state level by combining data at the county level. +4. Output each signal into a set of CSV files with a fixed format. +5. Run a set of checks on the output. + * This ensures output will be accepted by the acquisition code and hunts for common signs of buggy code or bad source data. +6. (Data-dependent) Compare today's output with a cached version of what's currently in the API. + * This converts dense output to a diff and reduces the size of each update. +7. Deliver the CSV output files to the `receiving/` directory on the API server. + +Adding a new indicator typically means implementing steps 1-3. Step 4 is included via the function ` create_export_csv`. Steps 5 (the validator), 6 (the archive differ) and 7 (acquisition) are all handled by runners in production. +## Step 0: Keep revision history (important!) + +If the data provider doesn’t provide or it is unclear if they provide historical versions of the data, immediately set up a script (bash, Python, etc) to automatically (e.g. 
cron) download the data every day and save locally with versioning. + +This step has a few goals: + +1. Determine if the data is revised over time +2. Understand the revision behavior in detail +3. If the data is revised, we want to save all possible versions, even before our pipeline is fully up + +The data should be saved in _raw_ form – do not do any processing. +Our own processing (cleaning, aggregation, normalization, etc) of the data may change as the pipeline code develops and doing any processing up front could make the historical data incompatible with the final procedure. + +Check back in a couple weeks to compare data versions for revisions. + + +## Step 1: Exploratory Analysis + +The goal for exploratory analysis is to decide how the dataset does and does not fit our needs. +This information will be used in the [indicator documentation](https://cmu-delphi.github.io/delphi-epidata/api/covidcast_signals.html) and will warn us about potential difficulties in the pipeline, so this should be done thoroughly! Your goal is to become an expert on the ins and outs of the data source. + +While some of this process might have been done already (i.e. +it was already decided that the data is useful), it is still important to understand the properties of the dataset. +The main objective during this stage is to understand what the dataset looks like in its raw format, establish what transformations need to be done, and create a basic roadmap to accomplish all later setup tasks. + +**What you want to establish:** + +* Data fields that correspond to signals we want to report +* Reporting lag and schedule +* Backfill behavior +* Sample size +* Georesolution +* Limitations + +Jupyter notebooks work particularly well for exploratory analysis but feel free to use whatever IDE/methodology works best for you. +Some of this analysis may be useful during statistical review later, so save your code! + +If anything unusual comes up, discuss with the stakeholder (usually the original requestor of the data source, can also be [@RoniRos](https://www.github.com/RoniRos)). +The goal is to figure out how to handle any issues before getting into the details of implementation. + +### Fetching the data + +Download the data in whatever format suits you. +A one-off manual download is fine. +Don’t worry too much about productionizing the data-fetching step at this point. +(Although any code you write can be used later.) + +Also check to see whether the data is coming from an existing source, e.g. NSSP and NCHS are accessed the same way, so when adding NSSP, we could reuse the API key and only needed to lightly modify the API calls for the new dataset. + +Reading from a local file: + +```{python} +import pandas as pd +df = pd.read_csv('/Users/lukeneureiter/Downloads/luke_cpr_test.csv') +``` +Fetching from Socrata: + +```{python} +import os +from sodapy import Socrata +token = os.environ.get("SODAPY_APPTOKEN") +client = Socrata("data.cdc.gov", token) +results = client.get("rdmq-nq56", limit=10**10) +df = pd.DataFrame.from_records(results, coerce_float=True) +``` + +### Detailed questions to answer + +At this stage we want to answer the questions below (and any others that seem relevant) and consider how we might use the data before we determine that the source should become a pipeline. + +* What raw signals are available in the data? + * If the raw signals aren’t useful themselves, what useful signals could we create from these? 
+ * Discuss with the data requestor or consult the data request GitHub issue which signals they are interested in. + If there are multiple potential signals, are there any known pros/cons of each one? + * For each signal, we want to report a value, standard error (data-dependent), and sample size (data-dependent) for each region for each unit of time. + Sample size is sometimes available as a separate “counts” signal. +* Are the signals available across different geographies? Can values be [meaningfully compared](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/google-symptoms.html#limitations) between locations? + * Ideally, we want to report data at [county, MSA, HRR, state, HHS, and nation levels](https://cmu-delphi.github.io/delphi-epidata/api/covidcast_geography.html) (US) or subregion level 2 (county, parish, etc), subregion level 1 (state, province, territory), and nation levels for other countries. + Some data sources report these levels themselves. + For those that don’t, we use the [`geomapper`](https://github.com/cmu-delphi/covidcast-indicators/blob/84d059751b646c0075f1a384741f2c1d80981269/_delphi_utils_python/delphi_utils/geomap.py) to aggregate up from smaller to larger geo types. + For that tool to work, signals must be aggregatable (i.e. + values have to be comparable between geos) and the data must be reported at supported geo types or at geo types that are mappable to supported geo types. +* What geographies might be included that are not standard? + * For example, some data sources report NYC as separate from New York State. + * Others require special handling: D.C. and territories (Puerto Rico, Guam, U.S. Virgin Islands). + * ! Sampling site, facility, or other data-specific or proprietary geographic division + * The data may not be appropriate for inclusion in the main endpoint (called COVIDcast as of 20240628). + Talk to [@dshemetov](https://www.github.com/dshemetov) (geomapper), [@melange396](https://www.github.com/melange396) (epidata, DB), and [@RoniRos](https://www.github.com/RoniRos) (PI) for discussion. + * Should the data have its own endpoint? + * Consider creating a PRD ([here](https://drive.google.com/drive/u/1/folders/155cGrc9Y7NWwygslCcU8gjL2AQbu5rFF) or [here](https://drive.google.com/drive/u/1/folders/13wUoIl-FjjCkbn2O8qH1iXOCBo2eF2-d)) to present design options. +* What is the sample size? Is this a privacy concern for us or for the data provider? +* How often is data missing? + * For privacy, some sources only report when sample size is above a certain threshold + * Missingness due to reporting pattern (e.g. no weekend reports)? + * Will we want to and is it feasible to [interpolate missing values](https://github.com/cmu-delphi/covidcast-indicators/issues/1539)? +* Are there any aberrant values that don’t make sense? e.g. negative counts, out of range percentages, “manual” missingness codes (9999, -9999, etc) +* Does the data source revise their data? How often? By how much? Is the revision meaningful, or an artifact of data processing methods? + * See raw data saved in [Step 0](#step-0-keep-revision-history-important) +* What is the reporting schedule of the data? +* What order of magnitude is the signal? (If it’s too small or too large, [this issue on how rounding is done](https://github.com/cmu-delphi/covidcast-indicators/issues/1945) needs to be addressed first) +* How is the data processed by the data source? E.g. 
normalization, censoring values with small sample sizes, censoring values associated with low-population areas, smoothing, adding jitter, etc. + Keep any code and notes around! They will be helpful for later steps. + For any issues that come up, consider now if + * We’ve seen them before in another dataset and, if so, how we handled it. + Is there code around that we can reuse? + * If it’s a small issue, how would you address it? Do you need an extra function to handle it? + * If it’s a big issue, talk to others and consider making a PRD to present potential solutions. + + +## Step 2: Pipeline Code + +Now that we know the substance and dimensions of our data, we can start planning the pipeline code. + +### Logic overview + +Broadly speaking, the objective here is to create a script that will download data, transform it (mainly by aggregating it to different geo levels), format it to match our standard format, and save the transformed data to the [receiving directory](https://github.com/cmu-delphi/covidcast-indicators/blob/d36352b/ansible/templates/changehc-params-prod.json.j2#L3) as a CSV. +The indicator, [validation](https://github.com/cmu-delphi/covidcast-indicators/tree/6912077acba97e835aff7d0cd3d64309a1a9241d/_delphi_utils_python/delphi_utils/validator) (a series of quality checks), and [archive diffing](https://github.com/cmu-delphi/covidcast-indicators/blob/6912077acba97e835aff7d0cd3d64309a1a9241d/_delphi_utils_python/delphi_utils/archive.py) (compressing the data by only outputting rows changed between data versions) are run via the runner. +Acquisition (ingestion of files from the receiving directory and into the database) is run separately (see the [`delphi-epidata repo`](https://github.com/cmu-delphi/delphi-epidata/tree/c65d8093d9e8fed97b3347e195cc9c40c1a5fcfa)). + +`params.json.template` is copied to `params.json` during a run. +`params.json` is used to set parameters that modify a run and that we expect we’ll want to change in the future e.g. date range to generate) or need to be obfuscated (e.g. API key). + +Each indicator includes a makefile (using GNU make), which provides predefined routines for local setup, testing, linting, and running the indicator. +At the moment, the makefiles use python 3.8.15+. + +### Development + +To get started, Delphi has a [basic code template](https://github.com/cmu-delphi/covidcast-indicators/tree/6f46f2b4a0cf86137fda5bd58025997647c87b46/_template_python) that you should copy into a top-level directory in the [`covidcast-indicators` repo](https://github.com/cmu-delphi/covidcast-indicators/). +It can also be helpful to read through other indicators, especially if they share a data source or format. + +Indicators should be written in python for speed and maintainability. +Don't use R. + +Generally, indicators have: + +* `run.py`: Run through all the pipeline steps. + Loops over all geo type-signal combinations we want to produce. + Handles logging and saving to CSV using functions from [`delphi_utils`](https://github.com/cmu-delphi/covidcast-indicators/tree/6912077acba97e835aff7d0cd3d64309a1a9241d/_delphi_utils_python/delphi_utils). +* `pull.py`: Fetch the data from the data source and do basic processing (e.g. drop unnecessary columns). + Advanced processing (e.g. sensorization) should go elsewhere. +* `geo.py`: Do geo-aggregation. 
+ This tends to be simple wrappers around [`delphi_utils.geomapper`](https://github.com/cmu-delphi/covidcast-indicators/blob/6912077acba97e835aff7d0cd3d64309a1a9241d/_delphi_utils_python/delphi_utils/geomap.py) functions. + Do other geo handling (e.g. finding and reporting DC as a state). +* `constants.py`: Lists of geos to produce, signals to produce, dataset ids, data source URL, etc. + +Your code should be _extensively_ commented! Especially note sections where you took an unusual approach (make sure to say why and consider briefly discussing alternate approaches). + +#### Function stubs + +If you have many functions you want to implement and/or anticipate a complex pipeline, consider starting with [function stubs](https://en.wikipedia.org/wiki/Method_stub) with comments or pseudo code. +Bonus: consider writing unit tests upfront based on the expected behavior of each function. + +Some stubs to consider: + +* Retrieve a list of filenames +* Download one data file (API call, csv reader, etc.) +* Iterate through filenames to download all data files +* Construct an SQL query +* Run an SQL query +* Keep a list of columns +* Geographic transformations (tend to be wrappers around [`delphi_utils.geomapper`](https://github.com/cmu-delphi/covidcast-indicators/blob/6912077acba97e835aff7d0cd3d64309a1a9241d/_delphi_utils_python/delphi_utils/geomap.py) functions) + +Example stub: + +```{python} +def api_call(args) + #implement api call + return df +``` + +Next, populate the function stubs with the intention of using them for a single pre-defined run (ignoring params.json, other geo levels, etc). +If you fetched data programmatically in Step 0, you can reuse that in your data-fetching code. +If you reformatted data in Step 1, you can reuse that too. +Below is an example of the function stub that has been populated with code for a one-off run. + +```{python} +def api_call(token: str): + client = Socrata('healthdata.gov', token) + results = client.get("di4u-7yu6", limit=5000) + results_df = pd.DataFrame.from_records(results) + return results_df +``` + +After that, generalize your code to be able to be run on all geos of interest, take settings from params.json, use constants for easy maintenance, with extensive documentation, etc. + +#### Development environment + +Make sure you have a functional environment with python 3.8.15+. +For local runs, the makefile’s make install target will set up a local virtual environment with necessary packages. + +(If working in R (very much NOT recommended), local runs can be run without a virtual environment or using the [`renv` package](https://rstudio.github.io/renv/articles/renv.html), but production runs should be set up to use Docker.) + +#### Dealing with data-types + +* Often problem encountered prior to geomapper + * Problems that can arise and how to address them +* Basic conversion + +TODO: A list of assumptions that the server makes about various columns would be helpful. +E.g. 
which geo values are allowed, should every valid date be present in some way, etc + +#### Dealing with geos + +In an ideal case, the data exists at one of our [already covered geos](https://cmu-delphi.github.io/delphi-epidata/api/covidcast_geography.html): + +* State: state_code (string, leftpadded to 2 digits with 0) or state_id (string) +* FIPS (state+county codes, string leftpadded to 5 digits with 0) +* ZIP +* MSA (metro statistical area, int) +* HRR (hospital referral region, int) + +If you want to map from one of these to another, the [`delphi_utils.geomapper`](https://github.com/cmu-delphi/covidcast-indicators/blob/6912077acba97e835aff7d0cd3d64309a1a9241d/_delphi_utils_python/delphi_utils/geomap.py) utility covers most cases. +A brief example of aggregating from states to hhs regions via their population: + +```{python} +from delphi_utils.geomap import GeoMapper +geo_mapper = GeoMapper() +geo_mapper.add_geocode(df, "state_id", "state_code", from_col = "state") # add codes and ids from the full names +df = geo_mapper.add_population_column(df, "state_code") # add state populations +hhs_version = geo_mapper.replace_geocode(df, "state_code","hhs", new_col = "geo_id") # aggregate to hhs regions, renaming the geo column to geo_id +``` + +This example is taken from [`hhs_hosp`](https://github.com/cmu-delphi/covidcast-indicators/blob/main/hhs_hosp/delphi_hhs/run.py); more documentation can be found in the `geomapper` class definition. + +#### Implement a Missing Value code system + +The column is described [here](https://cmu-delphi.github.io/delphi-epidata/api/missing_codes.html). + +#### Testing + +As a general rule, it helps to decompose your functions into operations for which you can write unit tests. +To run the tests, use `make test` in the top-level indicator directory. + +Unit tests are required for all functions. +Integration tests are highly desired, but may be difficult to set up depending on where the data is being fetched from. +Mocking functions are useful in this case. + +#### Naming + +Indicator and signal names need to be approved by [@RoniRos](https://www.github.com/RoniRos). +It is better to start that conversation sooner rather than later. + +The data source name as specified during an API call (e.g. in `epidatr::pub_covidcast(source = "jhu-csse", ...)`, "jhu-csse" is the data source name) should match the wildcard portion of the module name ("jhu" in `delphi_jhu`) _and_ the top-level directory name in `covidcast-indicators` ("jhu"). +(Ideally, these would all also match how we casually refer to the indicator ("JHU"), but that's hard to foresee and enforce.) + +Ideally, the indicator name should: + +* Make it easy to tell where the data is coming from +* Make it easy to tell what type of data it is and/or what is unique about it +* Be uniquely identifying enough that if we added another indicator from the same organization, we could tell the two apart +* Be fairly short +* Be descriptive + +Based on these guidelines, the `jhu-csse` indicator would be better as `jhu-csse` everywhere (module name could be `delphi_jhu_csse`), rather than having a mix of `jhu-csse` and `jhu`. + +Signal names should not be too long, but the most important feature is that they are descriptive. +If we're mirroring a processed dataset, consider keeping their signal names. 
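+
+As a concrete illustration of that correspondence, the existing `doctor_visits` indicator uses the top-level directory `doctor_visits`, the module `delphi_doctor_visits`, and the API data source name `doctor-visits`. Below is a minimal, illustrative sketch of querying that source with the `covidcast` Python client (already listed as a dependency in the indicator `setup.py` files); client setup details such as API key registration are omitted and may vary by client version.
+
+```{python}
+from datetime import date
+
+import covidcast
+
+# "doctor-visits" is the data source name used in API calls; it corresponds to the
+# covidcast-indicators directory "doctor_visits" and the module "delphi_doctor_visits".
+df = covidcast.signal(
+    data_source="doctor-visits",
+    signal="smoothed_adj_cli",
+    start_day=date(2021, 1, 1),
+    end_day=date(2021, 1, 7),
+    geo_type="state",
+)
+```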
+ +Use the following standard tags when creating new signal names: + +* `raw`: unsmoothed, _no longer used; if no smoothing is specified the signal is assumed to be "raw"_ +* `7dav`: smoothed using a average over a rolling 7-day window; comes at the end of the name +* `smoothed`: smoothed using a more complex smoothing algorithm; comes at the end of the name +* `prop`: counts per 100k population +* `pct`: percentage between 0 and 100 +* `num`: counts, _no longer used; if no value type is specified the signal is assumed to be a count_ +* `cli`: COVID-like illness (fever, along with cough or shortness of breath or difficulty breathing) +* `ili`: influenza-like illness (fever, along with cough or sore throat) + +Using this tag dictionary, we can interpret the following signals as + +* `confirmed_admissions_influenza_1d_prop` = raw (unsmoothed) daily ("1d") confirmed influenza hospital admissions ("confirmed_admissions_influenza") per 100,000 population ("prop"). +* `confirmed_admissions_influenza_1d_prop_7dav` = the same as above, but smoothed with a 7-day moving average ("7dav"). + +### Statistical review + +The data produced by the new indicator needs to be sanity-checked. +Think of this as doing [exploratory data analysis](#step-1-exploratory-analysis) again, but on the pipeline _output_. +Some of this does overlap with work done in Step 1, but should be revisited following our processing of the data. +Aspects of this investigation will be useful to include in the signal documentation. + +The analysis doesn't need to be formatted as a report, but should be all in one place, viewable by all Delphi members, and in a format that makes it easy to comment on. +Some good options are the GitHub issue originally requesting the data source and the GitHub pull request adding the indicator. + +There is not a formal process for this, and you're free to do whatever you think is reasonable and sufficient. +A thorough analysis would cover the following topics: + +* Run the [correlations notebook](https://github.com/cmu-delphi/covidcast/blob/5f15f71/R-notebooks/cor_dashboard.Rmd) ([example output](https://cmu-delphi.github.io/covidcast/R-notebooks/signal_correlations.html#)). + * This helps evaluate the potential value of the signals for modeling. + * Choropleths give another way to plot the data to look for weird patterns. + * Good starting point for further analyses. +* Compare the new signals against pre-existing relevant signals + * For signals that are ostensibly measuring the same thing, this helps us see issues and benefits of one versus the other and how well they agree (e.g. [JHU cases vs USAFacts cases](https://github.com/cmu-delphi/covidcast-indicators/issues/991)). + * For signals that we expect to be related, we should see correlations of the right sign and magnitude. +* Plot all signals over time. + * (unlikely) Do we need to do any interpolation? + * (unlikely) Think about if we should do any filtering/cleaning, e.g. [low sample size](https://github.com/cmu-delphi/covidcast-indicators/issues/1513#issuecomment-1036326474) in covid tests causing high variability in test positivity rate. +* Plot all signals for all geos over time and space (via choropleth). + * Look for anomalies, missing geos, missing-not-at-random values, etc. + * Verify that DC and any territories are being handled as expected. 
+* Think about [limitations](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html#limitations), gotchas, and lag and backfill characteristics. + +[Example analysis 1](https://github.com/cmu-delphi/covidcast-indicators/pull/1495#issuecomment-1039477646), [example analysis 2](https://github.com/cmu-delphi/covidcast-indicators/issues/367#issuecomment-717415555). + +Once the analysis is complete, have the stakeholder (usually the original requestor of the data source, can also be [@RoniRos](https://www.github.com/RoniRos)) review it. + +### Documentation + +The [documentation site](https://cmu-delphi.github.io/delphi-epidata/) ([code here](https://github.com/cmu-delphi/delphi-epidata/tree/628e9655144934f3903c133b6713df4d4fcc613e/docs)) stores long-term long-form documentation pages for each indicator, including those that are inactive. + +Active and new indicators go in the [COVIDcast Main Endpoint -> Data Sources and Signals](https://cmu-delphi.github.io/delphi-epidata/api/covidcast_signals.html) section ([code here](https://github.com/cmu-delphi/delphi-epidata/tree/628e9655144934f3903c133b6713df4d4fcc613e/docs/api/covidcast-signals)). +A [template doc page](https://github.com/cmu-delphi/delphi-epidata/blob/628e9655144934f3903c133b6713df4d4fcc613e/docs/api/covidcast-signals/_source-template.md) is available in the same directory. + +An indicator documentation page should contain as much detail (including technical detail) as possible. +The following fields are required: + +* Description of the data source and data collection methods +* Links to the data source (organization and specific dataset(s) used) +* Links to any data source documentation you referenced +* List of signal names, descriptions, with start dates +* Prose description of how signals are calculated +* Specific math showing how signals are calculated, if unusual or complex or you like equations +* How smoothing is done, if any +* Known limitations of the data source and the final signals +* Missingness characteristics, especially if the data is missing with a pattern (on weekends, specific states, etc) +* Lag and revision characteristics +* Licensing information + +and anything else that changes how users would use or interpret the data, impacts the usability of the signal, may be difficult to discover, recommended usecases, is unusual, any gotchas about the data or the data processing approach, etc. +_More detail is better!_ + +At the time that you're writing the documentation, you are the expert on the data source and the indicator. +Making the documentation thorough and clear will make the data maximally usable for future users, and will make maintenance for Delphi easier. + +(For similar reasons, comment your code extensively!) + +## Step 3: Deployment + +* This is after we have a working one-off script +* Using Delphi utils and functionality +* What happens to the data after it gets put in `receiving/`: + +Next, the `acquisition.covidcast` component of the `delphi-epidata` codebase does the following immediately after an indicator run (you need to set acquisition job up): + +1. Look in the `receiving/` folder to see if any new data files are available. + If there are, then: + 1. Import the new data into the epimetric_full table of the epidata.covid database, filling in the columns as follows: + 1. `source`: parsed from the name of the subdirectory of `receiving/` + 2. `signal`: parsed from the filename + 3. 
`time_type`: parsed from the filename + 4. `time_value`: parsed from the filename + 5. `geo_type`: parsed from the filename + 6. `geo_value`: parsed from each row of the csv file + 7. `value`: parsed from each row of the csv file + 8. `se`: parsed from each row of the csv file + 9. `sample_size`: parsed from each row of the csv file + 10. `issue`: whatever now is in time_type units + 11. `lag`: the difference in time_type units from now to time_value + 12. `value_updated_timestamp`: now + 2. Update the `epimetric_latest` table with any new keys or new versions of existing keys. + +### Staging + +After developing the pipeline code, but before deploying in development, the pipeline should be run on staging for at least a week. +This involves setting up some cronicle jobs as follows: + +first the indicator run + +Then the acquisition run + +See [@korlaxxalrok](https://www.github.com/korlaxxalrok) or [@minhkhul](https://www.github.com/minhkhul) for more information. + +https://cronicle-prod-01.delphi.cmu.edu/#Schedule?sub=edit_event&id=elr5clgy6rs + +https://cronicle-prod-01.delphi.cmu.edu/#Schedule?sub=edit_event&id=elr5ctl7art + +Note the staging hostname and how the acquisition job is chained to run right after the indicator job. +Do a few test runs. + +If everything goes well (check staging db if data is ingested properly), make a prod version of the indicator run job and use that to run indicator on a daily basis. + +Another thing to do is setting up the params.json template file in accordance with how you want to run the indicator and acquisition. +Pay attention to the receiving directory, as well as how you can store credentials in vault. +Refer to [this guide](https://docs.google.com/document/d/1Bbuvtoxowt7x2_8USx_JY-yTo-Av3oAFlhyG-vXGG-c/edit#heading=h.8kkoy8sx3t7f) for more vault info. + +### Signal Documentation + +TODO + +Apparently adding to a google spreadsheet, need to talk to someone (Carlyn) about the specifics + +How to add to signal discovery app + +How to add to www-main signal dashboard + +Github page signal documentation talk to [@nmdefries](https://www.github.com/nmdefries) and [@tinatownes](https://www.github.com/tinatownes) diff --git a/_template_python/Makefile b/_template_python/Makefile index bc88f1fec..390113eef 100644 --- a/_template_python/Makefile +++ b/_template_python/Makefile @@ -17,9 +17,12 @@ install-ci: venv pip install . lint: - . env/bin/activate; pylint $(dir) + . env/bin/activate; pylint $(dir) --rcfile=../pyproject.toml . env/bin/activate; pydocstyle $(dir) +format: + . env/bin/activate; darker $(dir) + test: . 
env/bin/activate ;\ (cd tests && ../env/bin/pytest --cov=$(dir) --cov-report=term-missing) diff --git a/_template_python/setup.py b/_template_python/setup.py index ba1325b3c..d7bc44078 100644 --- a/_template_python/setup.py +++ b/_template_python/setup.py @@ -2,14 +2,15 @@ from setuptools import find_packages required = [ + "covidcast", + "darker[isort]~=2.1.1", + "delphi-utils", "numpy", "pandas", "pydocstyle", - "pytest", - "pytest-cov", "pylint==2.8.3", - "delphi-utils", - "covidcast" + "pytest-cov", + "pytest", ] setup( diff --git a/ansible/templates/doctor_visits-params-prod.json.j2 b/ansible/templates/doctor_visits-params-prod.json.j2 index 49188d3bb..f6edab07f 100644 --- a/ansible/templates/doctor_visits-params-prod.json.j2 +++ b/ansible/templates/doctor_visits-params-prod.json.j2 @@ -1,7 +1,7 @@ { "common": { - "export_dir": "./receiving", - "log_filename": "./doctor-visits.log" + "export_dir": "/common/covidcast/receiving/doctor-visits", + "log_filename": "/var/log/indicators/doctor-visits.log" }, "indicator": { "input_file": "./input/SYNEDI_AGG_OUTPATIENT_18052020_1455CDT.csv.gz", @@ -43,4 +43,4 @@ ] } } -} \ No newline at end of file +} diff --git a/ansible/templates/nssp-params-prod.json.j2 b/ansible/templates/nssp-params-prod.json.j2 new file mode 100644 index 000000000..b131b6130 --- /dev/null +++ b/ansible/templates/nssp-params-prod.json.j2 @@ -0,0 +1,29 @@ +{ + "common": { + "export_dir": "/common/covidcast/receiving/nssp", + "log_filename": "/var/log/indicators/nssp.log", + "log_exceptions": false + }, + "indicator": { + "wip_signal": true, + "static_file_dir": "./static", + "socrata_token": "{{ nssp_token }}" + }, + "validation": { + "common": { + "data_source": "nssp", + "api_credentials": "{{ validation_api_key }}", + "span_length": 15, + "min_expected_lag": {"all": "7"}, + "max_expected_lag": {"all": "13"}, + "dry_run": true, + "suppressed_errors": [] + }, + "static": { + "minimum_sample_size": 0, + "missing_se_allowed": true, + "missing_sample_size_allowed": true + }, + "dynamic": {} + } +} diff --git a/ansible/templates/sir_complainsalot-params-prod.json.j2 b/ansible/templates/sir_complainsalot-params-prod.json.j2 index d6f5da081..e44f164b6 100644 --- a/ansible/templates/sir_complainsalot-params-prod.json.j2 +++ b/ansible/templates/sir_complainsalot-params-prod.json.j2 @@ -12,11 +12,6 @@ "maintainers": ["U01AP8GSWG3","U01069KCRS7"], "retired-signals": ["smoothed_covid19","smoothed_adj_covid19"] }, - "chng": { - "max_age": 6, - "maintainers": ["U01AP8GSWG3","U01069KCRS7"], - "retired-signals": ["7dav_outpatient_covid","7dav_inpatient_covid"] - }, "google-symptoms": { "max_age": 6, "maintainers": ["U01AP8GSWG3","U01069KCRS7"], @@ -47,8 +42,8 @@ "max_age":19, "maintainers": [] }, - "hhs": { - "max_age":15, + "nssp": { + "max_age":13, "maintainers": [] } } diff --git a/ansible/vars.yaml b/ansible/vars.yaml index 8e059c873..ff9ba135c 100644 --- a/ansible/vars.yaml +++ b/ansible/vars.yaml @@ -56,6 +56,9 @@ nchs_mortality_token: "{{ vault_cdc_socrata_token }}" # NWSS nwss_wastewater_token: "{{ vault_cdc_socrata_token }}" +# nssp +nssp_token: "{{ vault_cdc_socrata_token }}" + # SirCAL sir_complainsalot_api_key: "{{ vault_sir_complainsalot_api_key }}" sir_complainsalot_slack_token: "{{ vault_sir_complainsalot_slack_token }}" diff --git a/backfill_corrections/Dockerfile b/backfill_corrections/Dockerfile index 8d2bc8ea2..6d862508b 100644 --- a/backfill_corrections/Dockerfile +++ b/backfill_corrections/Dockerfile @@ -1,7 +1,7 @@ FROM gurobi/optimizer:9.5.1 as gurobi ## Install R 
and tidyverse -FROM rocker/tidyverse:latest +FROM rocker/tidyverse:4.2 WORKDIR /opt/gurobi COPY --from=gurobi /opt/gurobi . @@ -15,22 +15,17 @@ ENV LD_LIBRARY_PATH $GUROBI_HOME/lib RUN ln -s -f /usr/share/zoneinfo/America/New_York /etc/localtime RUN apt-get update && apt-get install -qq -y \ - libglpk-dev\ + apt-file \ python3-venv \ python3-dev \ python3-pip -RUN install2.r --error \ - roxygen2 \ - Rglpk \ - argparser - +RUN R -e 'install.packages("pak", repos = sprintf("https://r-lib.github.io/p/pak/stable/%s/%s/%s", .Platform$pkgType, R.Version()$os, R.Version()$arch))' +RUN R -e 'install.packages(c("rspm"))' RUN --mount=type=secret,id=GITHUB_TOKEN \ export GITHUB_PAT="$(cat /run/secrets/GITHUB_TOKEN)" && \ - R -e 'devtools::install_version("bettermc", version = "1.1.2")' && \ - R -e 'devtools::install_github("cmu-delphi/covidcast", ref = "evalcast", subdir = "R-packages/evalcast")' && \ - R -e 'devtools::install_github(repo="ryantibs/quantgen", subdir="quantgen")' && \ - R -e 'install.packages(list.files(path="/opt/gurobi/linux64/R/", pattern="^gurobi_.*[.]tar[.]gz$", full.names = TRUE), repos=NULL)' + R -e 'rspm::enable(); pak::pkg_install(c("roxygen2", "Rglpk", "argparser", "gfkse/bettermc@v1.1.2", "cmu-delphi/covidcast/R-packages/evalcast@evalcast", "ryantibs/quantgen/quantgen"))' +RUN R -e 'install.packages(list.files(path="/opt/gurobi/linux64/R/", pattern="^gurobi_.*[.]tar[.]gz$", full.names = TRUE), repos=NULL)' WORKDIR /backfill_corrections/ ADD ./delphiBackfillCorrection ./delphiBackfillCorrection/ diff --git a/changehc/.pylintrc b/changehc/.pylintrc deleted file mode 100644 index c71c52434..000000000 --- a/changehc/.pylintrc +++ /dev/null @@ -1,24 +0,0 @@ - -[MESSAGES CONTROL] - -disable=logging-format-interpolation, - too-many-locals, - too-many-arguments, - # Allow pytest functions to be part of a class. - no-self-use, - # Allow pytest classes to have one test. - too-few-public-methods, - # Ignore - R0903, C0301, R0914, C0103, W1203, E0611, R0902, R0913, W0105, W0611, W1401 - -[BASIC] - -# Allow arbitrarily short-named variables. -variable-rgx=[a-z_][a-z0-9_]* -argument-rgx=[a-z_][a-z0-9_]* -attr-rgx=[a-z_][a-z0-9_]* - -[DESIGN] - -# Don't complain about pytest "unused" arguments. -ignored-argument-names=(_.*|run_as_module) \ No newline at end of file diff --git a/changehc/Makefile b/changehc/Makefile index bc88f1fec..390113eef 100644 --- a/changehc/Makefile +++ b/changehc/Makefile @@ -17,9 +17,12 @@ install-ci: venv pip install . lint: - . env/bin/activate; pylint $(dir) + . env/bin/activate; pylint $(dir) --rcfile=../pyproject.toml . env/bin/activate; pydocstyle $(dir) +format: + . env/bin/activate; darker $(dir) + test: . 
env/bin/activate ;\ (cd tests && ../env/bin/pytest --cov=$(dir) --cov-report=term-missing) diff --git a/changehc/delphi_changehc/download_ftp_files.py b/changehc/delphi_changehc/download_ftp_files.py index f85ef9944..dad47bb3f 100644 --- a/changehc/delphi_changehc/download_ftp_files.py +++ b/changehc/delphi_changehc/download_ftp_files.py @@ -1,7 +1,6 @@ """Download files from the specified ftp server.""" # standard -import datetime import functools from os import path diff --git a/changehc/delphi_changehc/sensor.py b/changehc/delphi_changehc/sensor.py index d1422567b..0449f07df 100644 --- a/changehc/delphi_changehc/sensor.py +++ b/changehc/delphi_changehc/sensor.py @@ -6,9 +6,6 @@ """ -# standard packages -import logging - # third party import numpy as np import pandas as pd diff --git a/changehc/delphi_changehc/update_sensor.py b/changehc/delphi_changehc/update_sensor.py index cb5b42a4b..edae85517 100644 --- a/changehc/delphi_changehc/update_sensor.py +++ b/changehc/delphi_changehc/update_sensor.py @@ -5,18 +5,16 @@ Created: 2020-10-14 """ # standard packages -import logging from multiprocessing import Pool, cpu_count # third party import numpy as np import pandas as pd -from delphi_utils import GeoMapper, add_prefix, create_export_csv, Weekday +from delphi_utils import GeoMapper, Weekday, add_prefix, create_export_csv # first party from .config import Config -from .constants import SMOOTHED, SMOOTHED_ADJ, SMOOTHED_CLI, SMOOTHED_ADJ_CLI,\ - SMOOTHED_FLU, SMOOTHED_ADJ_FLU, NA +from .constants import SMOOTHED, SMOOTHED_ADJ, SMOOTHED_ADJ_CLI, SMOOTHED_ADJ_FLU, SMOOTHED_CLI, SMOOTHED_FLU from .sensor import CHCSensor @@ -173,10 +171,11 @@ def geo_reindex(self, data): unique_geo_ids = pd.unique(data_frame[geo]) data_frame.set_index([geo, Config.DATE_COL],inplace=True) # for each location, fill in all missing dates with 0 values - multiindex = pd.MultiIndex.from_product((unique_geo_ids, self.fit_dates), - names=[geo, Config.DATE_COL]) - assert (len(multiindex) <= (len(gmpr.get_geo_values(gmpr.as_mapper_name(geo))) * len(self.fit_dates)) - ), f"more loc-date pairs than maximum number of geographies x number of dates, length of multiindex is {len(multiindex)}, geo level is {geo}" + multiindex = pd.MultiIndex.from_product((unique_geo_ids, self.fit_dates), names=[geo, Config.DATE_COL]) + assert len(multiindex) <= (len(gmpr.get_geo_values(gmpr.as_mapper_name(geo))) * len(self.fit_dates)), ( + "more loc-date pairs than maximum number of geographies x number of dates, " + f"length of multiindex is {len(multiindex)}, geo level is {geo}" + ) # fill dataframe with missing dates using 0 data_frame = data_frame.reindex(multiindex, fill_value=0) @@ -201,14 +200,17 @@ def update_sensor(self, data.reset_index(inplace=True) data_frame = self.geo_reindex(data) # handle if we need to adjust by weekday - wd_params = Weekday.get_params( - data_frame, - "den", - ["num"], - Config.DATE_COL, - [1, 1e5], - self.logger, - ) if self.weekday else None + if self.weekday: + wd_params = Weekday.get_params_legacy( + data_frame, + "den", + ["num"], + Config.DATE_COL, + [1, 1e5], + self.logger, + ) + else: + wd_params = None # run sensor fitting code (maybe in parallel) if not self.parallel: dfs = [] diff --git a/changehc/setup.py b/changehc/setup.py index d46649391..d95beb771 100644 --- a/changehc/setup.py +++ b/changehc/setup.py @@ -2,18 +2,20 @@ from setuptools import find_packages required = [ + "boto3", + "covidcast", + "darker[isort]~=2.1.1", + "delphi-utils", + "moto~=4.2.14", "numpy", "pandas", + "paramiko", "pyarrow", 
"pydocstyle", - "pytest", - "pytest-cov", "pylint==2.8.3", - "delphi-utils", - "covidcast", - "boto3", - "moto~=4.2.14", - "paramiko" + "pytest-cov", + "pytest", + "cvxpy<1.6", ] setup( diff --git a/changehc/version.cfg b/changehc/version.cfg index d3d61ed12..f5c28d2cd 100644 --- a/changehc/version.cfg +++ b/changehc/version.cfg @@ -1 +1 @@ -current_version = 0.3.54 +current_version = 0.3.55 diff --git a/claims_hosp/.pylintrc b/claims_hosp/.pylintrc deleted file mode 100644 index 7fc2f5c30..000000000 --- a/claims_hosp/.pylintrc +++ /dev/null @@ -1,23 +0,0 @@ - -[MESSAGES CONTROL] - -disable=logging-format-interpolation, - too-many-locals, - too-many-arguments, - # Allow pytest functions to be part of a class. - no-self-use, - # Allow pytest classes to have one test. - too-few-public-methods, - - -[BASIC] - -# Allow arbitrarily short-named variables. -variable-rgx=[a-z_][a-z0-9_]* -argument-rgx=[a-z_][a-z0-9_]* -attr-rgx=[a-z_][a-z0-9_]* - -[DESIGN] - -# Don't complain about pytest "unused" arguments. -ignored-argument-names=(_.*|run_as_module) \ No newline at end of file diff --git a/claims_hosp/Makefile b/claims_hosp/Makefile index bc88f1fec..390113eef 100644 --- a/claims_hosp/Makefile +++ b/claims_hosp/Makefile @@ -17,9 +17,12 @@ install-ci: venv pip install . lint: - . env/bin/activate; pylint $(dir) + . env/bin/activate; pylint $(dir) --rcfile=../pyproject.toml . env/bin/activate; pydocstyle $(dir) +format: + . env/bin/activate; darker $(dir) + test: . env/bin/activate ;\ (cd tests && ../env/bin/pytest --cov=$(dir) --cov-report=term-missing) diff --git a/claims_hosp/delphi_claims_hosp/smooth.py b/claims_hosp/delphi_claims_hosp/smooth.py index a66bcc25c..56b132fa2 100644 --- a/claims_hosp/delphi_claims_hosp/smooth.py +++ b/claims_hosp/delphi_claims_hosp/smooth.py @@ -27,13 +27,11 @@ def left_gauss_linear(arr, bandwidth=Config.SMOOTHER_BANDWIDTH): """ n_rows = len(arr) out_arr = np.zeros_like(arr) - X = np.vstack([np.ones(n_rows), np.arange(n_rows)]).T # pylint: disable=invalid-name + X = np.vstack([np.ones(n_rows), np.arange(n_rows)]).T for idx in range(n_rows): weights = np.exp(-((np.arange(idx + 1) - idx) ** 2) / bandwidth) - # pylint: disable=invalid-name XwX = np.dot(X[: (idx + 1), :].T * weights, X[: (idx + 1), :]) Xwy = np.dot(X[: (idx + 1), :].T * weights, arr[: (idx + 1)].reshape(-1, 1)) - # pylint: enable=invalid-name try: beta = np.linalg.solve(XwX, Xwy) out_arr[idx] = np.dot(X[: (idx + 1), :], beta)[-1] diff --git a/claims_hosp/delphi_claims_hosp/update_indicator.py b/claims_hosp/delphi_claims_hosp/update_indicator.py index b4169370d..df3f3308f 100644 --- a/claims_hosp/delphi_claims_hosp/update_indicator.py +++ b/claims_hosp/delphi_claims_hosp/update_indicator.py @@ -13,13 +13,13 @@ # third party import numpy as np import pandas as pd -from delphi_utils import GeoMapper # first party -from delphi_utils import Weekday +from delphi_utils import GeoMapper, Weekday + from .config import Config, GeoConstants -from .load_data import load_data from .indicator import ClaimsHospIndicator +from .load_data import load_data class ClaimsHospIndicatorUpdater: @@ -152,15 +152,18 @@ def update_indicator(self, input_filepath, outpath, logger): data_frame = self.geo_reindex(data) # handle if we need to adjust by weekday - wd_params = Weekday.get_params( - data_frame, - "den", - ["num"], - Config.DATE_COL, - [1, 1e5], - logger, - ) if self.weekday else None - + wd_params = ( + Weekday.get_params_legacy( + data_frame, + "den", + ["num"], + Config.DATE_COL, + [1, 1e5], + logger, + ) + if 
self.weekday + else None + ) # run fitting code (maybe in parallel) rates = {} std_errs = {} diff --git a/claims_hosp/setup.py b/claims_hosp/setup.py index bc50a6414..3b859c294 100644 --- a/claims_hosp/setup.py +++ b/claims_hosp/setup.py @@ -2,16 +2,19 @@ from setuptools import find_packages required = [ + "covidcast", + "darker[isort]~=2.1.1", + "delphi-utils", "numpy", "pandas", - "pyarrow", "paramiko", + "pyarrow", "pydocstyle", - "pytest", - "pytest-cov", "pylint==2.8.3", - "delphi-utils", - "covidcast" + "pytest-cov", + "pytest", + "cvxpy<1.6", + "scs<3.2.6", # TODO: remove this ; it is a cvxpy dependency, and the excluded version appears to break our jenkins build. see: https://github.com/cvxgrp/scs/issues/283 ] setup( diff --git a/claims_hosp/version.cfg b/claims_hosp/version.cfg index d3d61ed12..f5c28d2cd 100644 --- a/claims_hosp/version.cfg +++ b/claims_hosp/version.cfg @@ -1 +1 @@ -current_version = 0.3.54 +current_version = 0.3.55 diff --git a/doctor_visits/.pylintrc b/doctor_visits/.pylintrc deleted file mode 100644 index a14b269cc..000000000 --- a/doctor_visits/.pylintrc +++ /dev/null @@ -1,8 +0,0 @@ -[DESIGN] - -min-public-methods=0 - - -[MESSAGES CONTROL] - -disable=R0801, C0200, C0330, E1101, E0611, E1136, C0114, C0116, C0103, R0913, R0914, R0915, W1401, W1202, W1203, W0702 diff --git a/doctor_visits/Makefile b/doctor_visits/Makefile index bc88f1fec..390113eef 100644 --- a/doctor_visits/Makefile +++ b/doctor_visits/Makefile @@ -17,9 +17,12 @@ install-ci: venv pip install . lint: - . env/bin/activate; pylint $(dir) + . env/bin/activate; pylint $(dir) --rcfile=../pyproject.toml . env/bin/activate; pydocstyle $(dir) +format: + . env/bin/activate; darker $(dir) + test: . env/bin/activate ;\ (cd tests && ../env/bin/pytest --cov=$(dir) --cov-report=term-missing) diff --git a/doctor_visits/README.md b/doctor_visits/README.md index ba2acf43e..9a9ac07b5 100644 --- a/doctor_visits/README.md +++ b/doctor_visits/README.md @@ -53,3 +53,9 @@ The output will show the number of unit tests that passed and failed, along with the percentage of code covered by the tests. None of the tests should fail and the code lines that are not covered by unit tests should be small and should not include critical sub-routines. 
+ +## Running Patches: +To get data issued during specific date range, output in batch issue format, adjust `params.json` in accordance with `patch.py`, then run +``` +env/bin/python -m delphi_doctor_visits.patch +``` diff --git a/doctor_visits/delphi_doctor_visits/download_claims_ftp_files.py b/doctor_visits/delphi_doctor_visits/download_claims_ftp_files.py index 002d4a7c9..efd110d8b 100644 --- a/doctor_visits/delphi_doctor_visits/download_claims_ftp_files.py +++ b/doctor_visits/delphi_doctor_visits/download_claims_ftp_files.py @@ -51,9 +51,13 @@ def change_date_format(name): name = '_'.join(split_name) return name -def download(ftp_credentials, out_path, logger): +def download(ftp_credentials, out_path, logger, issue_date=None): """Pull the latest raw files.""" - current_time = datetime.datetime.now() + if not issue_date: + current_time = datetime.datetime.now() + else: + current_time = datetime.datetime.strptime(issue_date, "%Y-%m-%d").replace(hour=23, minute=59, second=59) + logger.info("starting download", time=current_time) seconds_in_day = 24 * 60 * 60 diff --git a/doctor_visits/delphi_doctor_visits/get_latest_claims_name.py b/doctor_visits/delphi_doctor_visits/get_latest_claims_name.py index e417183c7..0a86d532f 100644 --- a/doctor_visits/delphi_doctor_visits/get_latest_claims_name.py +++ b/doctor_visits/delphi_doctor_visits/get_latest_claims_name.py @@ -5,9 +5,12 @@ import datetime from pathlib import Path -def get_latest_filename(dir_path, logger): +def get_latest_filename(dir_path, logger, issue_date=None): """Get the latest filename from the list of downloaded raw files.""" - current_date = datetime.datetime.now() + if issue_date: + current_date = datetime.datetime.strptime(issue_date, "%Y-%m-%d").replace(hour=23, minute=59, second=59) + else: + current_date = datetime.datetime.now() files = list(Path(dir_path).glob("*")) latest_timestamp = datetime.datetime(1900, 1, 1) @@ -24,7 +27,7 @@ def get_latest_filename(dir_path, logger): latest_timestamp = timestamp latest_filename = file - assert current_date.date() == latest_timestamp.date(), "no drop for today" + assert current_date.date() == latest_timestamp.date(), f"no drop for {current_date}" logger.info("Latest claims file", filename=latest_filename) diff --git a/doctor_visits/delphi_doctor_visits/patch.py b/doctor_visits/delphi_doctor_visits/patch.py new file mode 100644 index 000000000..32b6d308f --- /dev/null +++ b/doctor_visits/delphi_doctor_visits/patch.py @@ -0,0 +1,71 @@ +""" +This module is used for patching data in the delphi_doctor_visits package. + +To use this module, you need to specify the range of issue dates in params.json, like so: + +{ + "common": { + ... + }, + "validation": { + ... + }, + "patch": { + "patch_dir": "/Users/minhkhuele/Desktop/delphi/covidcast-indicators/doctor_visits/AprilPatch", + "start_issue": "2024-04-20", + "end_issue": "2024-04-21" + } +} + +It will generate data for that range of issue dates, and store them in batch issue format: +[name-of-patch]/issue_[issue-date]/doctor-visits/actual_data_file.csv +""" + +from datetime import datetime, timedelta +from os import makedirs + +from delphi_utils import get_structured_logger, read_params + +from .run import run_module + + +def patch(): + """ + Run the doctor visits indicator for a range of issue dates. 
+ + The range of issue dates is specified in params.json using the following keys: + - "patch": Only used for patching data + - "start_issue": str, YYYY-MM-DD format, first issue date + - "end_issue": str, YYYY-MM-DD format, last issue date + - "patch_dir": str, directory to write all issues output + """ + params = read_params() + logger = get_structured_logger("delphi_doctor_visits.patch", filename=params["common"]["log_filename"]) + + start_issue = datetime.strptime(params["patch"]["start_issue"], "%Y-%m-%d") + end_issue = datetime.strptime(params["patch"]["end_issue"], "%Y-%m-%d") + + logger.info(f"""Start patching {params["patch"]["patch_dir"]}""") + logger.info(f"""Start issue: {start_issue.strftime("%Y-%m-%d")}""") + logger.info(f"""End issue: {end_issue.strftime("%Y-%m-%d")}""") + + makedirs(params["patch"]["patch_dir"], exist_ok=True) + + current_issue = start_issue + + while current_issue <= end_issue: + logger.info(f"""Running issue {current_issue.strftime("%Y-%m-%d")}""") + + params["patch"]["current_issue"] = current_issue.strftime("%Y-%m-%d") + + current_issue_yyyymmdd = current_issue.strftime("%Y%m%d") + current_issue_dir = f"""{params["patch"]["patch_dir"]}/issue_{current_issue_yyyymmdd}/doctor-visits""" + makedirs(f"{current_issue_dir}", exist_ok=True) + params["common"]["export_dir"] = f"""{current_issue_dir}""" + + run_module(params, logger) + current_issue += timedelta(days=1) + + +if __name__ == "__main__": + patch() diff --git a/doctor_visits/delphi_doctor_visits/run.py b/doctor_visits/delphi_doctor_visits/run.py index 93c346ee7..3c941534a 100644 --- a/doctor_visits/delphi_doctor_visits/run.py +++ b/doctor_visits/delphi_doctor_visits/run.py @@ -20,7 +20,7 @@ from .get_latest_claims_name import get_latest_filename -def run_module(params): +def run_module(params, logger=None): # pylint: disable=too-many-statements """ Run doctor visits indicator. @@ -42,18 +42,26 @@ def run_module(params): - "se": bool, whether to write out standard errors - "obfuscated_prefix": str, prefix for signal name if write_se is True. - "parallel": bool, whether to update sensor in parallel. + - "patch": Only used for patching data, remove if not patching. + Check out patch.py and README for more details on how to run patches.
+ - "start_issue": str, YYYY-MM-DD format, first issue date + - "end_issue": str, YYYY-MM-DD format, last issue date + - "patch_dir": str, directory to write all issues output """ start_time = time.time() - logger = get_structured_logger( - __name__, filename=params["common"].get("log_filename"), - log_exceptions=params["common"].get("log_exceptions", True)) + issue_date = params.get("patch", {}).get("current_issue", None) + if not logger: + logger = get_structured_logger( + __name__, + filename=params["common"].get("log_filename"), + log_exceptions=params["common"].get("log_exceptions", True), + ) # pull latest data - download(params["indicator"]["ftp_credentials"], - params["indicator"]["input_dir"], logger) + download(params["indicator"]["ftp_credentials"], params["indicator"]["input_dir"], logger, issue_date=issue_date) # find the latest files (these have timestamps) - claims_file = get_latest_filename(params["indicator"]["input_dir"], logger) + claims_file = get_latest_filename(params["indicator"]["input_dir"], logger, issue_date=issue_date) # modify data modify_and_write(claims_file, logger) diff --git a/doctor_visits/delphi_doctor_visits/smooth.py b/doctor_visits/delphi_doctor_visits/smooth.py index 72f691942..d24f1b85f 100644 --- a/doctor_visits/delphi_doctor_visits/smooth.py +++ b/doctor_visits/delphi_doctor_visits/smooth.py @@ -22,7 +22,7 @@ def moving_avg(x, y, k=7): """ n = len(y) sy = np.zeros((n - k + 1, 1)) - for i in range(len(sy)): + for i in range(len(sy)): # pylint: disable=consider-using-enumerate sy[i] = np.mean(y[i : (i + k)]) return x[(k - 1) :], sy @@ -39,7 +39,7 @@ def padded_moving_avg(y, k=7): """ n = len(y) sy = np.zeros((n - k + 1, 1)) - for i in range(len(sy)): + for i in range(len(sy)): # pylint: disable=consider-using-enumerate sy[i] = np.mean(y[i : (i + k)]) # pad first k obs with 0 diff --git a/doctor_visits/delphi_doctor_visits/update_sensor.py b/doctor_visits/delphi_doctor_visits/update_sensor.py index 019c3f9d5..125c0df18 100644 --- a/doctor_visits/delphi_doctor_visits/update_sensor.py +++ b/doctor_visits/delphi_doctor_visits/update_sensor.py @@ -18,6 +18,7 @@ # first party from delphi_utils import Weekday + from .config import Config from .geo_maps import GeoMaps from .sensor import DoctorVisitsSensor @@ -125,15 +126,19 @@ def update_sensor( (burn_in_dates >= startdate) & (burn_in_dates <= enddate))[0][:len(sensor_dates)] # handle if we need to adjust by weekday - params = Weekday.get_params( - data, - "Denominator", - Config.CLI_COLS + Config.FLU1_COL, - Config.DATE_COL, - [1, 1e5, 1e10, 1e15], - logger, - ) if weekday else None - if weekday and np.any(np.all(params == 0,axis=1)): + params = ( + Weekday.get_params( + data, + "Denominator", + Config.CLI_COLS + Config.FLU1_COL, + Config.DATE_COL, + [1, 1e5, 1e10, 1e15], + logger, + ) + if weekday + else None + ) + if weekday and np.any(np.all(params == 0, axis=1)): # Weekday correction failed for at least one count type return None diff --git a/doctor_visits/setup.py b/doctor_visits/setup.py index faba7c670..17d6fc9af 100644 --- a/doctor_visits/setup.py +++ b/doctor_visits/setup.py @@ -2,14 +2,17 @@ from setuptools import find_packages required = [ + "darker[isort]~=2.1.1", + "delphi-utils", "numpy", "pandas", "paramiko", - "scikit-learn", - "pytest", - "pytest-cov", "pylint==2.8.3", - "delphi-utils" + "pytest-cov", + "pytest", + "scikit-learn", + "cvxpy>=1.5", + "scs<3.2.6", # TODO: remove this ; it is a cvxpy dependency, and the excluded version appears to break our jenkins build.
see: https://github.com/cvxgrp/scs/issues/283 ] setup( diff --git a/doctor_visits/tests/test_download.py b/doctor_visits/tests/test_download.py new file mode 100644 index 000000000..dc94e534c --- /dev/null +++ b/doctor_visits/tests/test_download.py @@ -0,0 +1,28 @@ +import unittest +from unittest.mock import patch, MagicMock +from delphi_doctor_visits.download_claims_ftp_files import download + +class TestDownload(unittest.TestCase): + @patch('delphi_doctor_visits.download_claims_ftp_files.paramiko.SSHClient') + @patch('delphi_doctor_visits.download_claims_ftp_files.path.exists', return_value=False) + def test_download(self, mock_exists, mock_sshclient): + mock_sshclient_instance = MagicMock() + mock_sshclient.return_value = mock_sshclient_instance + mock_sftp = MagicMock() + mock_sshclient_instance.open_sftp.return_value = mock_sftp + mock_sftp.listdir_attr.return_value = [MagicMock(filename="SYNEDI_AGG_OUTPATIENT_20200207_1455CDT.csv.gz")] + ftp_credentials = {"host": "test_host", "user": "test_user", "pass": "test_pass", "port": "test_port"} + out_path = "./test_data/" + logger = MagicMock() + + #case 1: download with issue_date that does not exist on ftp server + download(ftp_credentials, out_path, logger, issue_date="2020-02-08") + mock_sshclient_instance.connect.assert_called_once_with(ftp_credentials["host"], username=ftp_credentials["user"], password=ftp_credentials["pass"], port=ftp_credentials["port"]) + mock_sftp.get.assert_not_called() + + # case 2: download with issue_date that exists on ftp server + download(ftp_credentials, out_path, logger, issue_date="2020-02-07") + mock_sftp.get.assert_called() + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/doctor_visits/tests/test_get_latest_claims_name.py b/doctor_visits/tests/test_get_latest_claims_name.py index 98bd19e2d..d1003ad47 100644 --- a/doctor_visits/tests/test_get_latest_claims_name.py +++ b/doctor_visits/tests/test_get_latest_claims_name.py @@ -11,9 +11,12 @@ class TestGetLatestFileName: logger = Mock() - + dir_path = "test_data" + def test_get_latest_claims_name(self): - dir_path = "./test_data/" - with pytest.raises(AssertionError): - get_latest_filename(dir_path, self.logger) + get_latest_filename(self.dir_path, self.logger) + + def test_get_latest_claims_name_with_issue_date(self): + result = get_latest_filename(self.dir_path, self.logger, issue_date="2020-02-07") + assert str(result) == f"{self.dir_path}/SYNEDI_AGG_OUTPATIENT_07022020_1455CDT.csv.gz" diff --git a/doctor_visits/tests/test_patch.py b/doctor_visits/tests/test_patch.py new file mode 100644 index 000000000..5b4575a09 --- /dev/null +++ b/doctor_visits/tests/test_patch.py @@ -0,0 +1,37 @@ +import unittest +from unittest.mock import patch as mock_patch, call +from delphi_doctor_visits.patch import patch +import os +import shutil + +class TestPatchModule(unittest.TestCase): + def test_patch(self): + with mock_patch('delphi_doctor_visits.patch.run_module') as mock_run_module, \ + mock_patch('delphi_doctor_visits.patch.get_structured_logger') as mock_get_structured_logger, \ + mock_patch('delphi_doctor_visits.patch.read_params') as mock_read_params: + + mock_read_params.return_value = { + "common": { + "log_filename": "test.log" + }, + "patch": { + "start_issue": "2021-01-01", + "end_issue": "2021-01-02", + "patch_dir": "./patch_dir" + } + } + + patch() + + self.assertIn('current_issue', mock_read_params.return_value['patch']) + self.assertEqual(mock_read_params.return_value['patch']['current_issue'], 
'2021-01-02') + + self.assertTrue(os.path.isdir('./patch_dir')) + self.assertTrue(os.path.isdir('./patch_dir/issue_20210101/doctor-visits')) + self.assertTrue(os.path.isdir('./patch_dir/issue_20210102/doctor-visits')) + + # Clean up the created directories after the test + shutil.rmtree(mock_read_params.return_value["patch"]["patch_dir"]) + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/doctor_visits/version.cfg b/doctor_visits/version.cfg index d3d61ed12..f5c28d2cd 100644 --- a/doctor_visits/version.cfg +++ b/doctor_visits/version.cfg @@ -1 +1 @@ -current_version = 0.3.54 +current_version = 0.3.55 diff --git a/google_symptoms/.pylintrc b/google_symptoms/.pylintrc deleted file mode 100644 index f337ecf9c..000000000 --- a/google_symptoms/.pylintrc +++ /dev/null @@ -1,8 +0,0 @@ -[DESIGN] - -min-public-methods=1 - - -[MESSAGES CONTROL] - -disable=R0801, E1101, E0611, C0114, C0116, C0103, R0913, R0914, W0702, W0707 diff --git a/google_symptoms/Makefile b/google_symptoms/Makefile index f6a5b7e63..6884278cf 100644 --- a/google_symptoms/Makefile +++ b/google_symptoms/Makefile @@ -17,9 +17,12 @@ install-ci: venv pip install . lint: - . env/bin/activate; pylint $(dir) + . env/bin/activate; pylint $(dir) --rcfile=../pyproject.toml . env/bin/activate; pydocstyle $(dir) +format: + . env/bin/activate; darker $(dir) + test: . env/bin/activate ; (cd tests && ../env/bin/pytest --cov=$(dir) --cov-report=term-missing) diff --git a/google_symptoms/delphi_google_symptoms/pull.py b/google_symptoms/delphi_google_symptoms/pull.py index 29def6b4e..d5921a3e4 100644 --- a/google_symptoms/delphi_google_symptoms/pull.py +++ b/google_symptoms/delphi_google_symptoms/pull.py @@ -67,7 +67,7 @@ def preprocess(df, level): try: df = df[KEEP_COLUMNS] except KeyError: - raise ValueError( + raise ValueError( # pylint: disable=raise-missing-from "Some necessary columns are missing. The dataset " "schema may have changed. Please investigate." ) diff --git a/google_symptoms/setup.py b/google_symptoms/setup.py index 91af03e64..ccba3c47a 100644 --- a/google_symptoms/setup.py +++ b/google_symptoms/setup.py @@ -2,17 +2,18 @@ from setuptools import find_packages required = [ + "darker[isort]~=2.1.1", + "db-dtypes", + "delphi-utils", + "freezegun", "mock", "numpy", + "pandas-gbq", "pandas", "pydocstyle", - "pytest", - "pytest-cov", "pylint==2.8.3", - "delphi-utils", - "freezegun", - "pandas-gbq", - "db-dtypes" + "pytest-cov", + "pytest", ] setup( diff --git a/google_symptoms/version.cfg b/google_symptoms/version.cfg index d3d61ed12..f5c28d2cd 100644 --- a/google_symptoms/version.cfg +++ b/google_symptoms/version.cfg @@ -1 +1 @@ -current_version = 0.3.54 +current_version = 0.3.55 diff --git a/hhs_hosp/.pylintrc b/hhs_hosp/.pylintrc deleted file mode 100644 index 58c6edbba..000000000 --- a/hhs_hosp/.pylintrc +++ /dev/null @@ -1,22 +0,0 @@ - -[MESSAGES CONTROL] - -disable=logging-format-interpolation, - too-many-locals, - too-many-arguments, - # Allow pytest functions to be part of a class. - no-self-use, - # Allow pytest classes to have one test. - too-few-public-methods - -[BASIC] - -# Allow arbitrarily short-named variables. -variable-rgx=[a-z_][a-z0-9_]* -argument-rgx=[a-z_][a-z0-9_]* -attr-rgx=[a-z_][a-z0-9_]* - -[DESIGN] - -# Don't complain about pytest "unused" arguments. 
-ignored-argument-names=(_.*|run_as_module) diff --git a/hhs_hosp/Makefile b/hhs_hosp/Makefile index ea591dcb5..69529feb7 100644 --- a/hhs_hosp/Makefile +++ b/hhs_hosp/Makefile @@ -17,9 +17,12 @@ install-ci: venv pip install . lint: - . env/bin/activate; pylint $(dir) + . env/bin/activate; pylint $(dir) --rcfile=../pyproject.toml . env/bin/activate; pydocstyle $(dir) +format: + . env/bin/activate; darker $(dir) + test: . env/bin/activate ;\ (cd tests && ../env/bin/pytest --cov=$(dir) --cov-report=term-missing) diff --git a/hhs_hosp/setup.py b/hhs_hosp/setup.py index b19bcbb42..90a685ac8 100644 --- a/hhs_hosp/setup.py +++ b/hhs_hosp/setup.py @@ -2,16 +2,17 @@ from setuptools import find_packages required = [ + "covidcast", + "darker[isort]~=2.1.1", + "delphi-epidata", + "delphi-utils", "freezegun", "numpy", "pandas", "pydocstyle", - "pytest", - "pytest-cov", "pylint==2.8.3", - "delphi-utils", - "covidcast", - "delphi-epidata" + "pytest-cov", + "pytest", ] setup( diff --git a/hhs_hosp/version.cfg b/hhs_hosp/version.cfg index d3d61ed12..f5c28d2cd 100644 --- a/hhs_hosp/version.cfg +++ b/hhs_hosp/version.cfg @@ -1 +1 @@ -current_version = 0.3.54 +current_version = 0.3.55 diff --git a/nchs_mortality/.pylintrc b/nchs_mortality/.pylintrc deleted file mode 100644 index c72b4e124..000000000 --- a/nchs_mortality/.pylintrc +++ /dev/null @@ -1,24 +0,0 @@ - -[MESSAGES CONTROL] - -disable=logging-format-interpolation, - too-many-locals, - too-many-arguments, - too-many-branches, - too-many-statements, - # Allow pytest functions to be part of a class. - no-self-use, - # Allow pytest classes to have one test. - too-few-public-methods - -[BASIC] - -# Allow arbitrarily short-named variables. -variable-rgx=[a-z_][a-z0-9_]* -argument-rgx=[a-z_][a-z0-9_]* -attr-rgx=[a-z_][a-z0-9_]* - -[DESIGN] - -# Don't complain about pytest "unused" arguments. -ignored-argument-names=(_.*|run_as_module) \ No newline at end of file diff --git a/nchs_mortality/Makefile b/nchs_mortality/Makefile index bc88f1fec..390113eef 100644 --- a/nchs_mortality/Makefile +++ b/nchs_mortality/Makefile @@ -17,9 +17,12 @@ install-ci: venv pip install . lint: - . env/bin/activate; pylint $(dir) + . env/bin/activate; pylint $(dir) --rcfile=../pyproject.toml . env/bin/activate; pydocstyle $(dir) +format: + . env/bin/activate; darker $(dir) + test: . 
env/bin/activate ;\ (cd tests && ../env/bin/pytest --cov=$(dir) --cov-report=term-missing) diff --git a/nchs_mortality/setup.py b/nchs_mortality/setup.py index 76915936b..3fe354ba4 100644 --- a/nchs_mortality/setup.py +++ b/nchs_mortality/setup.py @@ -2,16 +2,17 @@ from setuptools import find_packages required = [ + "darker[isort]~=2.1.1", + "delphi-utils", + "epiweeks", + "freezegun", "numpy", "pandas", "pydocstyle", - "pytest", - "pytest-cov", "pylint==2.8.3", - "delphi-utils", + "pytest-cov", + "pytest", "sodapy", - "epiweeks", - "freezegun", ] setup( diff --git a/nchs_mortality/version.cfg b/nchs_mortality/version.cfg index d3d61ed12..f5c28d2cd 100644 --- a/nchs_mortality/version.cfg +++ b/nchs_mortality/version.cfg @@ -1 +1 @@ -current_version = 0.3.54 +current_version = 0.3.55 diff --git a/notebooks/.Rprofile b/notebooks/.Rprofile new file mode 100644 index 000000000..81b960f5c --- /dev/null +++ b/notebooks/.Rprofile @@ -0,0 +1 @@ +source("renv/activate.R") diff --git a/notebooks/nssp/cor_dashboard.Rmd b/notebooks/nssp/cor_dashboard.Rmd new file mode 100644 index 000000000..58d3ed6a0 --- /dev/null +++ b/notebooks/nssp/cor_dashboard.Rmd @@ -0,0 +1,257 @@ +--- +title: "Correlation Analyses for COVID-19 Indicators" +author: "Delphi Group" +date: "`r format(Sys.time(), '%B %d, %Y')`" +output: + html_document: + code_folding: hide +--- + +```{r, include = FALSE} +knitr::opts_chunk$set(message = FALSE, warning = FALSE, fig.width = 8, + fig.height = 7) +``` + +### Getting data +This requires that you've already run the nssp pipeline. See the `nssp` directory for instructions on doing that. +First loading some libraries and reading the results from the pipeline: +```{r} +library(covidcast) +library(epidatr) +library(dplyr) +library(ggplot2) + +library(purrr) +library(tidyverse) +library(dplyr) +library(readr) +files <- list.files(here::here("nssp/receiving"), pattern="\\.csv$", full.names = TRUE) +read_row <- function(filename) { + split_name <- filename %>% + tools::file_path_sans_ext() %>% + strsplit("/") %>% `[[`(1) %>% tail(n=1) %>% + strsplit("_") %>% `[[`(1) + week_number <- split_name[[2]] + geo_type <- split_name[[3]] + col_name <- split_name[-(1:3)] %>% paste(collapse = "_") + read_csv(filename, show_col_types = FALSE) %>% + as_tibble %>% + mutate(signal = col_name, + geo_type = geo_type, + week_number = week_number) %>% + mutate(across(geo_id, factor)) %>% + rename(geo_value = geo_id, time_value = week_number) %>% + select(-missing_se, -se, -sample_size, -missing_sample_size) %>% + return +} +res <- map(files, read_row) +nssp_data <- bind_rows(res) +nssp_state <- nssp_data %>% + filter(geo_type == "state") %>% + mutate(time_value = epidatr:::parse_api_week(time_value)) %>% + as_epi_df(time_type = "week", geo_type = "state") %>% + select(-missing_val, -geo_type) +unique(nssp_data$time_value) +``` +And epidatr versions of hhs for comparison +```{r} +library(epidatr) +eval_time <- epidatr::epirange(from = "2020-01-01", to = Sys.Date()) +fetch_args <- epidatr::fetch_args_list(return_empty = TRUE, timeout_seconds = 300) + +flu_hhs <- epidatr::pub_covidcast( + source = "hhs", + signals = "confirmed_admissions_influenza_1d_prop_7dav", + geo_type = "state", + time_type = "day", + geo_values = "*", + time_values = eval_time, + fetch_args = fetch_args + ) %>% + select(-signal, -source, - time_type) + +covid_hhs <- epidatr::pub_covidcast( + source = "hhs", + signals = "confirmed_admissions_covid_1d_prop_7dav", + geo_type = "state", + time_type = "day", + geo_values = "*", + time_values = 
eval_time, + fetch_args = fetch_args + ) %>% + select(-signal, -source, - time_type) + + +nchs <- epidatr::pub_covidcast( + source = "nchs-mortality", + signals = "deaths_allcause_incidence_num", + geo_type = "state", + time_type = "week", + geo_values = "*", + time_values = epidatr::epirange(from = "202001", to = "202418"), + fetch_args = epidatr::fetch_args_list(return_empty = TRUE, timeout_seconds = 300) + ) +``` +# Flu +```{r} +library(epiprocess) +nssp_flu_state <- nssp_state %>% filter(signal == "pct_ed_visits_influenza") %>% select(-signal) %>% drop_na %>% rename(pct_flu_visits = val) %>% as_epi_df(time_type = "week", geo_type = "state") +week_starts <- nssp_flu_state$time_value %>% unique() +flu_hhs_weekly <- flu_hhs %>% select(geo_value, time_value, value) %>% filter(time_value %in% week_starts) %>% rename(conf_admission = value) %>% drop_na %>% as_epi_df(time_type = "week", geo_type = "state") +joined <- nssp_flu_state %>% left_join(flu_hhs_weekly) +``` + +After the necessary joining, let's look at the average correlations +```{r} +cor(joined$pct_flu_visits, joined$conf_admission, method = "spearman") +``` +So the overall correlation is pretty high. + +## Correlations sliced by state +```{r} +correlations_space_flu <- epi_cor(joined, pct_flu_visits, conf_admission, cor_by = "geo_value", use = "complete.obs", method = "spearman") +library(maps) # For map data +states_map <- map_data("state") +mapped <- states_map %>% as_tibble %>% mutate(geo_value = setNames(tolower(state.abb), tolower(state.name))[region]) %>% right_join(correlations_space_flu) %>% arrange(group, order) +library(viridis) +ggplot(mapped, aes(x = long, y = lat, group = group, fill = cor)) + + geom_polygon(colour = "black") + + scale_fill_viridis(discrete=FALSE, option="viridis", limits = c(0,1)) + + coord_map("polyconic") + + labs(title = "Spearman Correlations between Flu ER visits and Flu hospital admissions") +ggsave("flu_ER_admissions_state_correlations.pdf") +``` +Over space, hospital admissions look like they're highly correlated with ER visits (which makes sense, frequently when one is admitted it is via the ER). +The lowest overall correlation is +```{r} +correlations_space_flu %>% summarize(across(where(is.numeric), .fns = list(min = min, median = median, mean = mean, std = sd, q25 = ~quantile(.,0.25), q75 = ~quantile(.,0.75), max = max))) +``` +### Lag evaluation +```{r} +library(purrr) +lags <- 0:35 + +lagged_flu_state <- map_dfr(lags, function(lag) { + epi_cor(joined, pct_flu_visits, conf_admission, cor_by = geo_value, dt1 = lag, use = "complete.obs", method = "spearman") %>% + mutate(lag = .env$lag) +}) + +lagged_flu_state %>% + group_by(lag) %>% + summarize(mean = mean(cor, na.rm = TRUE)) %>% + ggplot(aes(x = lag, y = mean)) + + geom_line() + + geom_point() + + labs(x = "Lag", y = "Mean correlation", title = "Lag comparison for state spearman correlations for flu ER and Hosp admissions") +ggsave("flu_ER_admissions_state_lag_cor.pdf") +``` +Somewhat unsurprisingly, the correlation is highest immediately afterward.
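+To pin down where that peak falls, one extra check (a quick sketch reusing the `lagged_flu_state` object from the chunk above; not part of the original analysis) is to pull out the lag whose mean state-level correlation is largest: +```{r} +# lag with the highest mean correlation across states (slice_max requires dplyr >= 1.0) +lagged_flu_state %>% group_by(lag) %>% summarize(mean = mean(cor, na.rm = TRUE)) %>% slice_max(mean, n = 1) +```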
+## Correlations sliced by time +```{r} +correlations_time_flu <- epi_cor(joined, pct_flu_visits, conf_admission, cor_by = "time_value", use = "complete.obs", method = "spearman") +correlations_time_flu +ggplot(correlations_time_flu, aes(x = time_value, y = cor)) + geom_line() + lims(y=c(0,1)) + labs(title = "Spearman Correlations between Flu ER visits and Flu hospital admissions") +ggsave("flu_ER_admissions_time_correlations.pdf") +``` +Strangely, sliced by time, we get significantly lower correlations +```{r} +correlations_time_flu %>% summarize(across(where(is.numeric), .fns = list(min = min, median = median, mean = mean, std = sd, q25 = ~quantile(.,0.25), q75 = ~quantile(.,0.75), max = max))) +``` +Seems like we have a Simpson's paradox adjacent result, since for any given location the signals are fairly well correlated when averaged over time, but at a given time, averaging over different locations suggests they're not very well correlated. +If the typical explanation applies, this means that there are large differences in the number of points. + +so, getting the counts: +```{r} +joined %>% group_by(geo_value) %>% count %>% arrange(n) %>% ungroup %>% summarise(across(where(is.numeric), .fns = list(min = min, max = max))) +``` +Each location has 82 + +```{r} +joined %>% group_by(time_value) %>% count %>% arrange(n) %>% ungroup %>% summarise(across(where(is.numeric), .fns = list(min = min, max = max))) +``` +# Covid +```{r} +library(epiprocess) +nssp_data %>% pull(signal) %>% unique +nssp_state <- nssp_data %>% + filter(geo_type == "state") %>% + mutate(time_value = epidatr:::parse_api_week(time_value)) %>% + as_epi_df(time_type = "week", geo_type = "state") %>% + select(-missing_val, -geo_type) +nssp_covid_state <- nssp_state %>% filter(signal == "pct_ed_visits_covid") %>% select(-signal) %>% drop_na %>% rename(pct_covid_visits = val) %>% as_epi_df(time_type = "week", geo_type = "state") +week_starts <- nssp_covid_state$time_value %>% unique() +covid_hhs_weekly <- covid_hhs %>% select(geo_value, time_value, value) %>% filter(time_value %in% week_starts) %>% rename(conf_admission = value) %>% drop_na %>% as_epi_df(time_type = "week", geo_type = "state") +joined_covid <- nssp_covid_state %>% left_join(covid_hhs_weekly) +``` + +After the necessary joining, lets look at the average correlations +```{r} +cor(joined_covid$pct_covid_visits, joined_covid$conf_admission, method = "spearman") +``` +So the overall correlation is pretty high, but lower than flu. + +## Correlations sliced by state +```{r} +correlations_space_covid <- epi_cor(joined_covid, pct_covid_visits, conf_admission, cor_by = "geo_value", use = "complete.obs", method = "spearman") +library(maps) # For map data +states_map <- map_data("state") +mapped <- states_map %>% as_tibble %>% mutate(geo_value = setNames(tolower(state.abb), tolower(state.name))[region]) %>% right_join(correlations_space_covid) %>% arrange(group, order) +library(viridis) +ggplot(mapped, aes(x = long, y = lat, group = group, fill = cor)) + + geom_polygon(colour = "black") + + scale_fill_viridis(discrete=FALSE, option="viridis", limits = c(0,1)) + + coord_map("polyconic") + + labs(title = "Spearman Correlations between covid ER visits and covid hospital admissions") +ggsave("covid_ER_admissions_state_correlations.pdf") +ggsave("covid_ER_admissions_state_correlations.png") +``` +Over space, hospital admissions look like they're highly correlated with ER visits (which makes sense, frequently when one is admitted it is via the ER). 
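+As a quick sanity check (a sketch reusing `correlations_space_covid` from the chunk above; not part of the original analysis), one can also list the handful of states where the two signals agree least, before looking at the summary below: +```{r} +# five states with the weakest covid ER-visit vs. admission correlation +correlations_space_covid %>% arrange(cor) %>% head(5) +```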
+The lowest overall correlation is +```{r} +correlations_space_covid %>% summarize(across(where(is.numeric), .fns = list(min = min, median = median, mean = mean, std = sd, q25 = ~quantile(.,0.25), q75 = ~quantile(.,0.75), max = max))) +``` +### Lag evaluation +```{r} +library(purrr) +lags <- 0:35 + +lagged_covid_state <- map_dfr(lags, function(lag) { + epi_cor(joined_covid, pct_covid_visits, conf_admission, cor_by = geo_value, dt1 = -lag, use = "complete.obs", method = "spearman") %>% + mutate(lag = .env$lag) +}) + +lagged_covid_state %>% + group_by(lag) %>% + summarize(mean = mean(cor, na.rm = TRUE)) %>% + ggplot(aes(x = lag, y = mean)) + + geom_line() + + geom_point() + + labs(x = "Lag", y = "Mean correlation", title = "Lag comparison for state spearman correlations for covid ER and Hosp admissions") +ggsave("covid_ER_admissions_state_lag_cor.pdf") +ggsave("covid_ER_admissions_state_lag_cor.png") +``` +Somewhat unsurprisingly, the correlation is highest immediately afterward, though its significantly lower than in the flu case. +## Correlations sliced by time +```{r} +correlations_time_covid <- epi_cor(joined_covid, pct_covid_visits, conf_admission, cor_by = "time_value", use = "complete.obs", method = "spearman") +correlations_time_covid +ggplot(correlations_time_covid, aes(x = time_value, y = cor)) + geom_line() + lims(y=c(0,1)) + labs(title = "Spearman Correlations between covid ER visits and covid hospital admissions") +ggsave("covid_ER_admissions_time_correlations.pdf") +ggsave("covid_ER_admissions_time_correlations.png") +``` +Strangely, sliced by time, we get significantly lower correlations, some of them are even negative +```{r} +correlations_time_covid %>% summarize(across(where(is.numeric), .fns = list(min = min, median = median, mean = mean, std = sd, q25 = ~quantile(.,0.25), q75 = ~quantile(.,0.75), max = max))) +``` +Seems like we have a Simpson's paradox adjacent result, since for any given location the signals are fairly well correlated when averaged over time, but at a given time, averaging over different locations suggests they're not very well correlated. +If the typical explanation applies, this means that there are large differences in the number of points. 
+ +so, getting the counts: +```{r} +joined_covid %>% group_by(geo_value) %>% count %>% arrange(n) %>% ungroup %>% summarise(across(where(is.numeric), .fns = list(min = min, max = max))) +``` +Each location has 82 + +```{r} +joined_covid %>% group_by(time_value) %>% count %>% arrange(n) %>% ungroup %>% summarise(across(where(is.numeric), .fns = list(min = min, max = max))) +``` diff --git a/notebooks/nssp/covid_ER_admissions_state_correlations.pdf b/notebooks/nssp/covid_ER_admissions_state_correlations.pdf new file mode 100644 index 000000000..35f272b03 Binary files /dev/null and b/notebooks/nssp/covid_ER_admissions_state_correlations.pdf differ diff --git a/notebooks/nssp/covid_ER_admissions_state_lag_cor.pdf b/notebooks/nssp/covid_ER_admissions_state_lag_cor.pdf new file mode 100644 index 000000000..86852e048 Binary files /dev/null and b/notebooks/nssp/covid_ER_admissions_state_lag_cor.pdf differ diff --git a/notebooks/nssp/covid_ER_admissions_time_correlations.pdf b/notebooks/nssp/covid_ER_admissions_time_correlations.pdf new file mode 100644 index 000000000..be131cd92 Binary files /dev/null and b/notebooks/nssp/covid_ER_admissions_time_correlations.pdf differ diff --git a/notebooks/nssp/flu_ER_admissions_state_correlations.pdf b/notebooks/nssp/flu_ER_admissions_state_correlations.pdf new file mode 100644 index 000000000..56e04209d Binary files /dev/null and b/notebooks/nssp/flu_ER_admissions_state_correlations.pdf differ diff --git a/notebooks/nssp/flu_ER_admissions_state_lag_cor.pdf b/notebooks/nssp/flu_ER_admissions_state_lag_cor.pdf new file mode 100644 index 000000000..d08c84c32 Binary files /dev/null and b/notebooks/nssp/flu_ER_admissions_state_lag_cor.pdf differ diff --git a/notebooks/nssp/flu_ER_admissions_time_correlations.pdf b/notebooks/nssp/flu_ER_admissions_time_correlations.pdf new file mode 100644 index 000000000..c00c1479e Binary files /dev/null and b/notebooks/nssp/flu_ER_admissions_time_correlations.pdf differ diff --git a/notebooks/renv.lock b/notebooks/renv.lock new file mode 100644 index 000000000..dd103cd87 --- /dev/null +++ b/notebooks/renv.lock @@ -0,0 +1,1367 @@ +{ + "R": { + "Version": "4.4.0", + "Repositories": [ + { + "Name": "RSPM", + "URL": "https://packagemanager.posit.co/all/latest" + } + ] + }, + "Packages": { + "DBI": { + "Package": "DBI", + "Version": "1.2.2", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "methods" + ], + "Hash": "164809cd72e1d5160b4cb3aa57f510fe" + }, + "KernSmooth": { + "Package": "KernSmooth", + "Version": "2.23-22", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "stats" + ], + "Hash": "2fecebc3047322fa5930f74fae5de70f" + }, + "MASS": { + "Package": "MASS", + "Version": "7.3-60.2", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "grDevices", + "graphics", + "methods", + "stats", + "utils" + ], + "Hash": "2f342c46163b0b54d7b64d1f798e2c78" + }, + "MMWRweek": { + "Package": "MMWRweek", + "Version": "0.1.3", + "Source": "Repository", + "Repository": "RSPM", + "Hash": "4329e57e2536e12afe479e8571416dbc" + }, + "Matrix": { + "Package": "Matrix", + "Version": "1.7-0", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "grDevices", + "graphics", + "grid", + "lattice", + "methods", + "stats", + "utils" + ], + "Hash": "1920b2f11133b12350024297d8a4ff4a" + }, + "R6": { + "Package": "R6", + "Version": "2.5.1", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R" + ], + "Hash": 
"470851b6d5d0ac559e9d01bb352b4021" + }, + "RColorBrewer": { + "Package": "RColorBrewer", + "Version": "1.1-3", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R" + ], + "Hash": "45f0398006e83a5b10b72a90663d8d8c" + }, + "Rcpp": { + "Package": "Rcpp", + "Version": "1.0.12", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "methods", + "utils" + ], + "Hash": "5ea2700d21e038ace58269ecdbeb9ec0" + }, + "askpass": { + "Package": "askpass", + "Version": "1.2.0", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "sys" + ], + "Hash": "cad6cf7f1d5f6e906700b9d3e718c796" + }, + "backports": { + "Package": "backports", + "Version": "1.4.1", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R" + ], + "Hash": "c39fbec8a30d23e721980b8afb31984c" + }, + "base64enc": { + "Package": "base64enc", + "Version": "0.1-3", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R" + ], + "Hash": "543776ae6848fde2f48ff3816d0628bc" + }, + "bit": { + "Package": "bit", + "Version": "4.0.5", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R" + ], + "Hash": "d242abec29412ce988848d0294b208fd" + }, + "bit64": { + "Package": "bit64", + "Version": "4.0.5", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "bit", + "methods", + "stats", + "utils" + ], + "Hash": "9fe98599ca456d6552421db0d6772d8f" + }, + "bslib": { + "Package": "bslib", + "Version": "0.7.0", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "base64enc", + "cachem", + "fastmap", + "grDevices", + "htmltools", + "jquerylib", + "jsonlite", + "lifecycle", + "memoise", + "mime", + "rlang", + "sass" + ], + "Hash": "8644cc53f43828f19133548195d7e59e" + }, + "cachem": { + "Package": "cachem", + "Version": "1.0.8", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "fastmap", + "rlang" + ], + "Hash": "c35768291560ce302c0a6589f92e837d" + }, + "checkmate": { + "Package": "checkmate", + "Version": "2.3.1", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "backports", + "utils" + ], + "Hash": "c01cab1cb0f9125211a6fc99d540e315" + }, + "class": { + "Package": "class", + "Version": "7.3-22", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "MASS", + "R", + "stats", + "utils" + ], + "Hash": "f91f6b29f38b8c280f2b9477787d4bb2" + }, + "classInt": { + "Package": "classInt", + "Version": "0.4-10", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "KernSmooth", + "R", + "class", + "e1071", + "grDevices", + "graphics", + "stats" + ], + "Hash": "f5a40793b1ae463a7ffb3902a95bf864" + }, + "cli": { + "Package": "cli", + "Version": "3.6.2", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "utils" + ], + "Hash": "1216ac65ac55ec0058a6f75d7ca0fd52" + }, + "clipr": { + "Package": "clipr", + "Version": "0.8.0", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "utils" + ], + "Hash": "3f038e5ac7f41d4ac41ce658c85e3042" + }, + "colorspace": { + "Package": "colorspace", + "Version": "2.1-0", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "grDevices", + "graphics", + "methods", + "stats" + ], + "Hash": "f20c47fd52fae58b4e377c37bb8c335b" + }, + "covidcast": { + "Package": "covidcast", + "Version": "0.5.2", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "MMWRweek", + "R", + "dplyr", + "ggplot2", + "grDevices", + 
"httr", + "purrr", + "rlang", + "sf", + "tidyr", + "xml2" + ], + "Hash": "ee88255e014ff787bd3db3f4735fb24a" + }, + "cpp11": { + "Package": "cpp11", + "Version": "0.4.7", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R" + ], + "Hash": "5a295d7d963cc5035284dcdbaf334f4e" + }, + "crayon": { + "Package": "crayon", + "Version": "1.5.2", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "grDevices", + "methods", + "utils" + ], + "Hash": "e8a1e41acf02548751f45c718d55aa6a" + }, + "credentials": { + "Package": "credentials", + "Version": "2.0.1", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "askpass", + "curl", + "jsonlite", + "openssl", + "sys" + ], + "Hash": "c7844b32098dcbd1c59cbd8dddb4ecc6" + }, + "curl": { + "Package": "curl", + "Version": "5.2.1", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R" + ], + "Hash": "411ca2c03b1ce5f548345d2fc2685f7a" + }, + "desc": { + "Package": "desc", + "Version": "1.4.3", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "R6", + "cli", + "utils" + ], + "Hash": "99b79fcbd6c4d1ce087f5c5c758b384f" + }, + "digest": { + "Package": "digest", + "Version": "0.6.35", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "utils" + ], + "Hash": "698ece7ba5a4fa4559e3d537e7ec3d31" + }, + "dplyr": { + "Package": "dplyr", + "Version": "1.1.4", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "R6", + "cli", + "generics", + "glue", + "lifecycle", + "magrittr", + "methods", + "pillar", + "rlang", + "tibble", + "tidyselect", + "utils", + "vctrs" + ], + "Hash": "fedd9d00c2944ff00a0e2696ccf048ec" + }, + "e1071": { + "Package": "e1071", + "Version": "1.7-14", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "class", + "grDevices", + "graphics", + "methods", + "proxy", + "stats", + "utils" + ], + "Hash": "4ef372b716824753719a8a38b258442d" + }, + "epidatr": { + "Package": "epidatr", + "Version": "1.1.1", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "MMWRweek", + "R", + "cachem", + "checkmate", + "cli", + "glue", + "httr", + "jsonlite", + "magrittr", + "openssl", + "purrr", + "rappdirs", + "readr", + "tibble", + "usethis", + "xml2" + ], + "Hash": "cf6f60be321bfd49298e27717be8c2b2" + }, + "evaluate": { + "Package": "evaluate", + "Version": "0.23", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "methods" + ], + "Hash": "daf4a1246be12c1fa8c7705a0935c1a0" + }, + "fansi": { + "Package": "fansi", + "Version": "1.0.6", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "grDevices", + "utils" + ], + "Hash": "962174cf2aeb5b9eea581522286a911f" + }, + "farver": { + "Package": "farver", + "Version": "2.1.1", + "Source": "Repository", + "Repository": "RSPM", + "Hash": "8106d78941f34855c440ddb946b8f7a5" + }, + "fastmap": { + "Package": "fastmap", + "Version": "1.1.1", + "Source": "Repository", + "Repository": "RSPM", + "Hash": "f7736a18de97dea803bde0a2daaafb27" + }, + "fontawesome": { + "Package": "fontawesome", + "Version": "0.5.2", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "htmltools", + "rlang" + ], + "Hash": "c2efdd5f0bcd1ea861c2d4e2a883a67d" + }, + "fs": { + "Package": "fs", + "Version": "1.6.4", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "methods" + ], + "Hash": "15aeb8c27f5ea5161f9f6a641fafd93a" + }, + "generics": { + "Package": 
"generics", + "Version": "0.1.3", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "methods" + ], + "Hash": "15e9634c0fcd294799e9b2e929ed1b86" + }, + "gert": { + "Package": "gert", + "Version": "2.0.1", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "askpass", + "credentials", + "openssl", + "rstudioapi", + "sys", + "zip" + ], + "Hash": "f70d3fe2d9e7654213a946963d1591eb" + }, + "ggplot2": { + "Package": "ggplot2", + "Version": "3.5.1", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "MASS", + "R", + "cli", + "glue", + "grDevices", + "grid", + "gtable", + "isoband", + "lifecycle", + "mgcv", + "rlang", + "scales", + "stats", + "tibble", + "vctrs", + "withr" + ], + "Hash": "44c6a2f8202d5b7e878ea274b1092426" + }, + "gh": { + "Package": "gh", + "Version": "1.4.1", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "cli", + "gitcreds", + "glue", + "httr2", + "ini", + "jsonlite", + "lifecycle", + "rlang" + ], + "Hash": "fbbbc48eba7a6626a08bb365e44b563b" + }, + "gitcreds": { + "Package": "gitcreds", + "Version": "0.1.2", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R" + ], + "Hash": "ab08ac61f3e1be454ae21911eb8bc2fe" + }, + "glue": { + "Package": "glue", + "Version": "1.7.0", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "methods" + ], + "Hash": "e0b3a53876554bd45879e596cdb10a52" + }, + "gtable": { + "Package": "gtable", + "Version": "0.3.5", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "cli", + "glue", + "grid", + "lifecycle", + "rlang" + ], + "Hash": "e18861963cbc65a27736e02b3cd3c4a0" + }, + "highr": { + "Package": "highr", + "Version": "0.10", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "xfun" + ], + "Hash": "06230136b2d2b9ba5805e1963fa6e890" + }, + "hms": { + "Package": "hms", + "Version": "1.1.3", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "lifecycle", + "methods", + "pkgconfig", + "rlang", + "vctrs" + ], + "Hash": "b59377caa7ed00fa41808342002138f9" + }, + "htmltools": { + "Package": "htmltools", + "Version": "0.5.8.1", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "base64enc", + "digest", + "fastmap", + "grDevices", + "rlang", + "utils" + ], + "Hash": "81d371a9cc60640e74e4ab6ac46dcedc" + }, + "httr": { + "Package": "httr", + "Version": "1.4.7", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "R6", + "curl", + "jsonlite", + "mime", + "openssl" + ], + "Hash": "ac107251d9d9fd72f0ca8049988f1d7f" + }, + "httr2": { + "Package": "httr2", + "Version": "1.0.1", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "R6", + "cli", + "curl", + "glue", + "lifecycle", + "magrittr", + "openssl", + "rappdirs", + "rlang", + "vctrs", + "withr" + ], + "Hash": "03d741c92fda96d98c3a3f22494e3b4a" + }, + "ini": { + "Package": "ini", + "Version": "0.3.1", + "Source": "Repository", + "Repository": "RSPM", + "Hash": "6154ec2223172bce8162d4153cda21f7" + }, + "isoband": { + "Package": "isoband", + "Version": "0.2.7", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "grid", + "utils" + ], + "Hash": "0080607b4a1a7b28979aecef976d8bc2" + }, + "jquerylib": { + "Package": "jquerylib", + "Version": "0.1.4", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "htmltools" + ], + "Hash": "5aab57a3bd297eee1c1d862735972182" + }, + 
"jsonlite": { + "Package": "jsonlite", + "Version": "1.8.8", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "methods" + ], + "Hash": "e1b9c55281c5adc4dd113652d9e26768" + }, + "knitr": { + "Package": "knitr", + "Version": "1.46", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "evaluate", + "highr", + "methods", + "tools", + "xfun", + "yaml" + ], + "Hash": "6e008ab1d696a5283c79765fa7b56b47" + }, + "labeling": { + "Package": "labeling", + "Version": "0.4.3", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "graphics", + "stats" + ], + "Hash": "b64ec208ac5bc1852b285f665d6368b3" + }, + "lattice": { + "Package": "lattice", + "Version": "0.22-6", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "grDevices", + "graphics", + "grid", + "stats", + "utils" + ], + "Hash": "cc5ac1ba4c238c7ca9fa6a87ca11a7e2" + }, + "lifecycle": { + "Package": "lifecycle", + "Version": "1.0.4", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "cli", + "glue", + "rlang" + ], + "Hash": "b8552d117e1b808b09a832f589b79035" + }, + "magrittr": { + "Package": "magrittr", + "Version": "2.0.3", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R" + ], + "Hash": "7ce2733a9826b3aeb1775d56fd305472" + }, + "memoise": { + "Package": "memoise", + "Version": "2.0.1", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "cachem", + "rlang" + ], + "Hash": "e2817ccf4a065c5d9d7f2cfbe7c1d78c" + }, + "mgcv": { + "Package": "mgcv", + "Version": "1.9-1", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "Matrix", + "R", + "graphics", + "methods", + "nlme", + "splines", + "stats", + "utils" + ], + "Hash": "110ee9d83b496279960e162ac97764ce" + }, + "mime": { + "Package": "mime", + "Version": "0.12", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "tools" + ], + "Hash": "18e9c28c1d3ca1560ce30658b22ce104" + }, + "munsell": { + "Package": "munsell", + "Version": "0.5.1", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "colorspace", + "methods" + ], + "Hash": "4fd8900853b746af55b81fda99da7695" + }, + "nlme": { + "Package": "nlme", + "Version": "3.1-164", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "graphics", + "lattice", + "stats", + "utils" + ], + "Hash": "a623a2239e642806158bc4dc3f51565d" + }, + "openssl": { + "Package": "openssl", + "Version": "2.1.2", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "askpass" + ], + "Hash": "ea2475b073243d9d338aa8f086ce973e" + }, + "pillar": { + "Package": "pillar", + "Version": "1.9.0", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "cli", + "fansi", + "glue", + "lifecycle", + "rlang", + "utf8", + "utils", + "vctrs" + ], + "Hash": "15da5a8412f317beeee6175fbc76f4bb" + }, + "pkgconfig": { + "Package": "pkgconfig", + "Version": "2.0.3", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "utils" + ], + "Hash": "01f28d4278f15c76cddbea05899c5d6f" + }, + "prettyunits": { + "Package": "prettyunits", + "Version": "1.2.0", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R" + ], + "Hash": "6b01fc98b1e86c4f705ce9dcfd2f57c7" + }, + "progress": { + "Package": "progress", + "Version": "1.2.3", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "R6", + "crayon", + "hms", + "prettyunits" + ], + "Hash": 
"f4625e061cb2865f111b47ff163a5ca6" + }, + "proxy": { + "Package": "proxy", + "Version": "0.4-27", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "stats", + "utils" + ], + "Hash": "e0ef355c12942cf7a6b91a6cfaea8b3e" + }, + "purrr": { + "Package": "purrr", + "Version": "1.0.2", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "cli", + "lifecycle", + "magrittr", + "rlang", + "vctrs" + ], + "Hash": "1cba04a4e9414bdefc9dcaa99649a8dc" + }, + "rappdirs": { + "Package": "rappdirs", + "Version": "0.3.3", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R" + ], + "Hash": "5e3c5dc0b071b21fa128676560dbe94d" + }, + "readr": { + "Package": "readr", + "Version": "2.1.5", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "R6", + "cli", + "clipr", + "cpp11", + "crayon", + "hms", + "lifecycle", + "methods", + "rlang", + "tibble", + "tzdb", + "utils", + "vroom" + ], + "Hash": "9de96463d2117f6ac49980577939dfb3" + }, + "renv": { + "Package": "renv", + "Version": "1.0.7", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "utils" + ], + "Hash": "397b7b2a265bc5a7a06852524dabae20" + }, + "rlang": { + "Package": "rlang", + "Version": "1.1.3", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "utils" + ], + "Hash": "42548638fae05fd9a9b5f3f437fbbbe2" + }, + "rmarkdown": { + "Package": "rmarkdown", + "Version": "2.26", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "bslib", + "evaluate", + "fontawesome", + "htmltools", + "jquerylib", + "jsonlite", + "knitr", + "methods", + "tinytex", + "tools", + "utils", + "xfun", + "yaml" + ], + "Hash": "9b148e7f95d33aac01f31282d49e4f44" + }, + "rprojroot": { + "Package": "rprojroot", + "Version": "2.0.4", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R" + ], + "Hash": "4c8415e0ec1e29f3f4f6fc108bef0144" + }, + "rstudioapi": { + "Package": "rstudioapi", + "Version": "0.16.0", + "Source": "Repository", + "Repository": "RSPM", + "Hash": "96710351d642b70e8f02ddeb237c46a7" + }, + "s2": { + "Package": "s2", + "Version": "1.1.6", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "Rcpp", + "wk" + ], + "Hash": "32f7b1a15bb01ae809022960abad5363" + }, + "sass": { + "Package": "sass", + "Version": "0.4.9", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R6", + "fs", + "htmltools", + "rappdirs", + "rlang" + ], + "Hash": "d53dbfddf695303ea4ad66f86e99b95d" + }, + "scales": { + "Package": "scales", + "Version": "1.3.0", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "R6", + "RColorBrewer", + "cli", + "farver", + "glue", + "labeling", + "lifecycle", + "munsell", + "rlang", + "viridisLite" + ], + "Hash": "c19df082ba346b0ffa6f833e92de34d1" + }, + "sf": { + "Package": "sf", + "Version": "1.0-16", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "DBI", + "R", + "Rcpp", + "classInt", + "grDevices", + "graphics", + "grid", + "magrittr", + "methods", + "s2", + "stats", + "tools", + "units", + "utils" + ], + "Hash": "ad57b543f7c3fca05213ba78ff63df9b" + }, + "stringi": { + "Package": "stringi", + "Version": "1.8.3", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "stats", + "tools", + "utils" + ], + "Hash": "058aebddea264f4c99401515182e656a" + }, + "stringr": { + "Package": "stringr", + "Version": "1.5.1", + "Source": "Repository", + "Repository": 
"RSPM", + "Requirements": [ + "R", + "cli", + "glue", + "lifecycle", + "magrittr", + "rlang", + "stringi", + "vctrs" + ], + "Hash": "960e2ae9e09656611e0b8214ad543207" + }, + "sys": { + "Package": "sys", + "Version": "3.4.2", + "Source": "Repository", + "Repository": "RSPM", + "Hash": "3a1be13d68d47a8cd0bfd74739ca1555" + }, + "tibble": { + "Package": "tibble", + "Version": "3.2.1", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "fansi", + "lifecycle", + "magrittr", + "methods", + "pillar", + "pkgconfig", + "rlang", + "utils", + "vctrs" + ], + "Hash": "a84e2cc86d07289b3b6f5069df7a004c" + }, + "tidyr": { + "Package": "tidyr", + "Version": "1.3.1", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "cli", + "cpp11", + "dplyr", + "glue", + "lifecycle", + "magrittr", + "purrr", + "rlang", + "stringr", + "tibble", + "tidyselect", + "utils", + "vctrs" + ], + "Hash": "915fb7ce036c22a6a33b5a8adb712eb1" + }, + "tidyselect": { + "Package": "tidyselect", + "Version": "1.2.1", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "cli", + "glue", + "lifecycle", + "rlang", + "vctrs", + "withr" + ], + "Hash": "829f27b9c4919c16b593794a6344d6c0" + }, + "tinytex": { + "Package": "tinytex", + "Version": "0.50", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "xfun" + ], + "Hash": "be7a76845222ad20adb761f462eed3ea" + }, + "tzdb": { + "Package": "tzdb", + "Version": "0.4.0", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "cpp11" + ], + "Hash": "f561504ec2897f4d46f0c7657e488ae1" + }, + "units": { + "Package": "units", + "Version": "0.8-5", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "Rcpp" + ], + "Hash": "119d19da480e873f72241ff6962ffd83" + }, + "usethis": { + "Package": "usethis", + "Version": "2.2.3", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "cli", + "clipr", + "crayon", + "curl", + "desc", + "fs", + "gert", + "gh", + "glue", + "jsonlite", + "lifecycle", + "purrr", + "rappdirs", + "rlang", + "rprojroot", + "rstudioapi", + "stats", + "utils", + "whisker", + "withr", + "yaml" + ], + "Hash": "d524fd42c517035027f866064417d7e6" + }, + "utf8": { + "Package": "utf8", + "Version": "1.2.4", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R" + ], + "Hash": "62b65c52671e6665f803ff02954446e9" + }, + "vctrs": { + "Package": "vctrs", + "Version": "0.6.5", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "cli", + "glue", + "lifecycle", + "rlang" + ], + "Hash": "c03fa420630029418f7e6da3667aac4a" + }, + "viridisLite": { + "Package": "viridisLite", + "Version": "0.4.2", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R" + ], + "Hash": "c826c7c4241b6fc89ff55aaea3fa7491" + }, + "vroom": { + "Package": "vroom", + "Version": "1.6.5", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "bit64", + "cli", + "cpp11", + "crayon", + "glue", + "hms", + "lifecycle", + "methods", + "progress", + "rlang", + "stats", + "tibble", + "tidyselect", + "tzdb", + "vctrs", + "withr" + ], + "Hash": "390f9315bc0025be03012054103d227c" + }, + "whisker": { + "Package": "whisker", + "Version": "0.4.1", + "Source": "Repository", + "Repository": "RSPM", + "Hash": "c6abfa47a46d281a7d5159d0a8891e88" + }, + "withr": { + "Package": "withr", + "Version": "3.0.0", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + 
"grDevices", + "graphics" + ], + "Hash": "d31b6c62c10dcf11ec530ca6b0dd5d35" + }, + "wk": { + "Package": "wk", + "Version": "0.9.1", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R" + ], + "Hash": "5d4545e140e36476f35f20d0ca87963e" + }, + "xfun": { + "Package": "xfun", + "Version": "0.43", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "grDevices", + "stats", + "tools" + ], + "Hash": "ab6371d8653ce5f2f9290f4ec7b42a8e" + }, + "xml2": { + "Package": "xml2", + "Version": "1.3.6", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "cli", + "methods", + "rlang" + ], + "Hash": "1d0336142f4cd25d8d23cd3ba7a8fb61" + }, + "yaml": { + "Package": "yaml", + "Version": "2.3.8", + "Source": "Repository", + "Repository": "RSPM", + "Hash": "29240487a071f535f5e5d5a323b7afbd" + }, + "zip": { + "Package": "zip", + "Version": "2.3.1", + "Source": "Repository", + "Repository": "RSPM", + "Hash": "fcc4bd8e6da2d2011eb64a5e5cc685ab" + } + } +} diff --git a/notebooks/renv/.gitignore b/notebooks/renv/.gitignore new file mode 100644 index 000000000..0ec0cbba2 --- /dev/null +++ b/notebooks/renv/.gitignore @@ -0,0 +1,7 @@ +library/ +local/ +cellar/ +lock/ +python/ +sandbox/ +staging/ diff --git a/notebooks/renv/activate.R b/notebooks/renv/activate.R new file mode 100644 index 000000000..d13f9932a --- /dev/null +++ b/notebooks/renv/activate.R @@ -0,0 +1,1220 @@ + +local({ + + # the requested version of renv + version <- "1.0.7" + attr(version, "sha") <- NULL + + # the project directory + project <- Sys.getenv("RENV_PROJECT") + if (!nzchar(project)) + project <- getwd() + + # use start-up diagnostics if enabled + diagnostics <- Sys.getenv("RENV_STARTUP_DIAGNOSTICS", unset = "FALSE") + if (diagnostics) { + start <- Sys.time() + profile <- tempfile("renv-startup-", fileext = ".Rprof") + utils::Rprof(profile) + on.exit({ + utils::Rprof(NULL) + elapsed <- signif(difftime(Sys.time(), start, units = "auto"), digits = 2L) + writeLines(sprintf("- renv took %s to run the autoloader.", format(elapsed))) + writeLines(sprintf("- Profile: %s", profile)) + print(utils::summaryRprof(profile)) + }, add = TRUE) + } + + # figure out whether the autoloader is enabled + enabled <- local({ + + # first, check config option + override <- getOption("renv.config.autoloader.enabled") + if (!is.null(override)) + return(override) + + # if we're being run in a context where R_LIBS is already set, + # don't load -- presumably we're being run as a sub-process and + # the parent process has already set up library paths for us + rcmd <- Sys.getenv("R_CMD", unset = NA) + rlibs <- Sys.getenv("R_LIBS", unset = NA) + if (!is.na(rlibs) && !is.na(rcmd)) + return(FALSE) + + # next, check environment variables + # TODO: prefer using the configuration one in the future + envvars <- c( + "RENV_CONFIG_AUTOLOADER_ENABLED", + "RENV_AUTOLOADER_ENABLED", + "RENV_ACTIVATE_PROJECT" + ) + + for (envvar in envvars) { + envval <- Sys.getenv(envvar, unset = NA) + if (!is.na(envval)) + return(tolower(envval) %in% c("true", "t", "1")) + } + + # enable by default + TRUE + + }) + + # bail if we're not enabled + if (!enabled) { + + # if we're not enabled, we might still need to manually load + # the user profile here + profile <- Sys.getenv("R_PROFILE_USER", unset = "~/.Rprofile") + if (file.exists(profile)) { + cfg <- Sys.getenv("RENV_CONFIG_USER_PROFILE", unset = "TRUE") + if (tolower(cfg) %in% c("true", "t", "1")) + sys.source(profile, envir = globalenv()) + } + + return(FALSE) + + } + + # avoid 
recursion + if (identical(getOption("renv.autoloader.running"), TRUE)) { + warning("ignoring recursive attempt to run renv autoloader") + return(invisible(TRUE)) + } + + # signal that we're loading renv during R startup + options(renv.autoloader.running = TRUE) + on.exit(options(renv.autoloader.running = NULL), add = TRUE) + + # signal that we've consented to use renv + options(renv.consent = TRUE) + + # load the 'utils' package eagerly -- this ensures that renv shims, which + # mask 'utils' packages, will come first on the search path + library(utils, lib.loc = .Library) + + # unload renv if it's already been loaded + if ("renv" %in% loadedNamespaces()) + unloadNamespace("renv") + + # load bootstrap tools + `%||%` <- function(x, y) { + if (is.null(x)) y else x + } + + catf <- function(fmt, ..., appendLF = TRUE) { + + quiet <- getOption("renv.bootstrap.quiet", default = FALSE) + if (quiet) + return(invisible()) + + msg <- sprintf(fmt, ...) + cat(msg, file = stdout(), sep = if (appendLF) "\n" else "") + + invisible(msg) + + } + + header <- function(label, + ..., + prefix = "#", + suffix = "-", + n = min(getOption("width"), 78)) + { + label <- sprintf(label, ...) + n <- max(n - nchar(label) - nchar(prefix) - 2L, 8L) + if (n <= 0) + return(paste(prefix, label)) + + tail <- paste(rep.int(suffix, n), collapse = "") + paste0(prefix, " ", label, " ", tail) + + } + + heredoc <- function(text, leave = 0) { + + # remove leading, trailing whitespace + trimmed <- gsub("^\\s*\\n|\\n\\s*$", "", text) + + # split into lines + lines <- strsplit(trimmed, "\n", fixed = TRUE)[[1L]] + + # compute common indent + indent <- regexpr("[^[:space:]]", lines) + common <- min(setdiff(indent, -1L)) - leave + paste(substring(lines, common), collapse = "\n") + + } + + startswith <- function(string, prefix) { + substring(string, 1, nchar(prefix)) == prefix + } + + bootstrap <- function(version, library) { + + friendly <- renv_bootstrap_version_friendly(version) + section <- header(sprintf("Bootstrapping renv %s", friendly)) + catf(section) + + # attempt to download renv + catf("- Downloading renv ... ", appendLF = FALSE) + withCallingHandlers( + tarball <- renv_bootstrap_download(version), + error = function(err) { + catf("FAILED") + stop("failed to download:\n", conditionMessage(err)) + } + ) + catf("OK") + on.exit(unlink(tarball), add = TRUE) + + # now attempt to install + catf("- Installing renv ... 
", appendLF = FALSE) + withCallingHandlers( + status <- renv_bootstrap_install(version, tarball, library), + error = function(err) { + catf("FAILED") + stop("failed to install:\n", conditionMessage(err)) + } + ) + catf("OK") + + # add empty line to break up bootstrapping from normal output + catf("") + + return(invisible()) + } + + renv_bootstrap_tests_running <- function() { + getOption("renv.tests.running", default = FALSE) + } + + renv_bootstrap_repos <- function() { + + # get CRAN repository + cran <- getOption("renv.repos.cran", "https://cloud.r-project.org") + + # check for repos override + repos <- Sys.getenv("RENV_CONFIG_REPOS_OVERRIDE", unset = NA) + if (!is.na(repos)) { + + # check for RSPM; if set, use a fallback repository for renv + rspm <- Sys.getenv("RSPM", unset = NA) + if (identical(rspm, repos)) + repos <- c(RSPM = rspm, CRAN = cran) + + return(repos) + + } + + # check for lockfile repositories + repos <- tryCatch(renv_bootstrap_repos_lockfile(), error = identity) + if (!inherits(repos, "error") && length(repos)) + return(repos) + + # retrieve current repos + repos <- getOption("repos") + + # ensure @CRAN@ entries are resolved + repos[repos == "@CRAN@"] <- cran + + # add in renv.bootstrap.repos if set + default <- c(FALLBACK = "https://cloud.r-project.org") + extra <- getOption("renv.bootstrap.repos", default = default) + repos <- c(repos, extra) + + # remove duplicates that might've snuck in + dupes <- duplicated(repos) | duplicated(names(repos)) + repos[!dupes] + + } + + renv_bootstrap_repos_lockfile <- function() { + + lockpath <- Sys.getenv("RENV_PATHS_LOCKFILE", unset = "renv.lock") + if (!file.exists(lockpath)) + return(NULL) + + lockfile <- tryCatch(renv_json_read(lockpath), error = identity) + if (inherits(lockfile, "error")) { + warning(lockfile) + return(NULL) + } + + repos <- lockfile$R$Repositories + if (length(repos) == 0) + return(NULL) + + keys <- vapply(repos, `[[`, "Name", FUN.VALUE = character(1)) + vals <- vapply(repos, `[[`, "URL", FUN.VALUE = character(1)) + names(vals) <- keys + + return(vals) + + } + + renv_bootstrap_download <- function(version) { + + sha <- attr(version, "sha", exact = TRUE) + + methods <- if (!is.null(sha)) { + + # attempting to bootstrap a development version of renv + c( + function() renv_bootstrap_download_tarball(sha), + function() renv_bootstrap_download_github(sha) + ) + + } else { + + # attempting to bootstrap a release version of renv + c( + function() renv_bootstrap_download_tarball(version), + function() renv_bootstrap_download_cran_latest(version), + function() renv_bootstrap_download_cran_archive(version) + ) + + } + + for (method in methods) { + path <- tryCatch(method(), error = identity) + if (is.character(path) && file.exists(path)) + return(path) + } + + stop("All download methods failed") + + } + + renv_bootstrap_download_impl <- function(url, destfile) { + + mode <- "wb" + + # https://bugs.r-project.org/bugzilla/show_bug.cgi?id=17715 + fixup <- + Sys.info()[["sysname"]] == "Windows" && + substring(url, 1L, 5L) == "file:" + + if (fixup) + mode <- "w+b" + + args <- list( + url = url, + destfile = destfile, + mode = mode, + quiet = TRUE + ) + + if ("headers" %in% names(formals(utils::download.file))) + args$headers <- renv_bootstrap_download_custom_headers(url) + + do.call(utils::download.file, args) + + } + + renv_bootstrap_download_custom_headers <- function(url) { + + headers <- getOption("renv.download.headers") + if (is.null(headers)) + return(character()) + + if (!is.function(headers)) + 
stopf("'renv.download.headers' is not a function") + + headers <- headers(url) + if (length(headers) == 0L) + return(character()) + + if (is.list(headers)) + headers <- unlist(headers, recursive = FALSE, use.names = TRUE) + + ok <- + is.character(headers) && + is.character(names(headers)) && + all(nzchar(names(headers))) + + if (!ok) + stop("invocation of 'renv.download.headers' did not return a named character vector") + + headers + + } + + renv_bootstrap_download_cran_latest <- function(version) { + + spec <- renv_bootstrap_download_cran_latest_find(version) + type <- spec$type + repos <- spec$repos + + baseurl <- utils::contrib.url(repos = repos, type = type) + ext <- if (identical(type, "source")) + ".tar.gz" + else if (Sys.info()[["sysname"]] == "Windows") + ".zip" + else + ".tgz" + name <- sprintf("renv_%s%s", version, ext) + url <- paste(baseurl, name, sep = "/") + + destfile <- file.path(tempdir(), name) + status <- tryCatch( + renv_bootstrap_download_impl(url, destfile), + condition = identity + ) + + if (inherits(status, "condition")) + return(FALSE) + + # report success and return + destfile + + } + + renv_bootstrap_download_cran_latest_find <- function(version) { + + # check whether binaries are supported on this system + binary <- + getOption("renv.bootstrap.binary", default = TRUE) && + !identical(.Platform$pkgType, "source") && + !identical(getOption("pkgType"), "source") && + Sys.info()[["sysname"]] %in% c("Darwin", "Windows") + + types <- c(if (binary) "binary", "source") + + # iterate over types + repositories + for (type in types) { + for (repos in renv_bootstrap_repos()) { + + # retrieve package database + db <- tryCatch( + as.data.frame( + utils::available.packages(type = type, repos = repos), + stringsAsFactors = FALSE + ), + error = identity + ) + + if (inherits(db, "error")) + next + + # check for compatible entry + entry <- db[db$Package %in% "renv" & db$Version %in% version, ] + if (nrow(entry) == 0) + next + + # found it; return spec to caller + spec <- list(entry = entry, type = type, repos = repos) + return(spec) + + } + } + + # if we got here, we failed to find renv + fmt <- "renv %s is not available from your declared package repositories" + stop(sprintf(fmt, version)) + + } + + renv_bootstrap_download_cran_archive <- function(version) { + + name <- sprintf("renv_%s.tar.gz", version) + repos <- renv_bootstrap_repos() + urls <- file.path(repos, "src/contrib/Archive/renv", name) + destfile <- file.path(tempdir(), name) + + for (url in urls) { + + status <- tryCatch( + renv_bootstrap_download_impl(url, destfile), + condition = identity + ) + + if (identical(status, 0L)) + return(destfile) + + } + + return(FALSE) + + } + + renv_bootstrap_download_tarball <- function(version) { + + # if the user has provided the path to a tarball via + # an environment variable, then use it + tarball <- Sys.getenv("RENV_BOOTSTRAP_TARBALL", unset = NA) + if (is.na(tarball)) + return() + + # allow directories + if (dir.exists(tarball)) { + name <- sprintf("renv_%s.tar.gz", version) + tarball <- file.path(tarball, name) + } + + # bail if it doesn't exist + if (!file.exists(tarball)) { + + # let the user know we weren't able to honour their request + fmt <- "- RENV_BOOTSTRAP_TARBALL is set (%s) but does not exist." 
+ msg <- sprintf(fmt, tarball) + warning(msg) + + # bail + return() + + } + + catf("- Using local tarball '%s'.", tarball) + tarball + + } + + renv_bootstrap_download_github <- function(version) { + + enabled <- Sys.getenv("RENV_BOOTSTRAP_FROM_GITHUB", unset = "TRUE") + if (!identical(enabled, "TRUE")) + return(FALSE) + + # prepare download options + pat <- Sys.getenv("GITHUB_PAT") + if (nzchar(Sys.which("curl")) && nzchar(pat)) { + fmt <- "--location --fail --header \"Authorization: token %s\"" + extra <- sprintf(fmt, pat) + saved <- options("download.file.method", "download.file.extra") + options(download.file.method = "curl", download.file.extra = extra) + on.exit(do.call(base::options, saved), add = TRUE) + } else if (nzchar(Sys.which("wget")) && nzchar(pat)) { + fmt <- "--header=\"Authorization: token %s\"" + extra <- sprintf(fmt, pat) + saved <- options("download.file.method", "download.file.extra") + options(download.file.method = "wget", download.file.extra = extra) + on.exit(do.call(base::options, saved), add = TRUE) + } + + url <- file.path("https://api.github.com/repos/rstudio/renv/tarball", version) + name <- sprintf("renv_%s.tar.gz", version) + destfile <- file.path(tempdir(), name) + + status <- tryCatch( + renv_bootstrap_download_impl(url, destfile), + condition = identity + ) + + if (!identical(status, 0L)) + return(FALSE) + + renv_bootstrap_download_augment(destfile) + + return(destfile) + + } + + # Add Sha to DESCRIPTION. This is stop gap until #890, after which we + # can use renv::install() to fully capture metadata. + renv_bootstrap_download_augment <- function(destfile) { + sha <- renv_bootstrap_git_extract_sha1_tar(destfile) + if (is.null(sha)) { + return() + } + + # Untar + tempdir <- tempfile("renv-github-") + on.exit(unlink(tempdir, recursive = TRUE), add = TRUE) + untar(destfile, exdir = tempdir) + pkgdir <- dir(tempdir, full.names = TRUE)[[1]] + + # Modify description + desc_path <- file.path(pkgdir, "DESCRIPTION") + desc_lines <- readLines(desc_path) + remotes_fields <- c( + "RemoteType: github", + "RemoteHost: api.github.com", + "RemoteRepo: renv", + "RemoteUsername: rstudio", + "RemotePkgRef: rstudio/renv", + paste("RemoteRef: ", sha), + paste("RemoteSha: ", sha) + ) + writeLines(c(desc_lines[desc_lines != ""], remotes_fields), con = desc_path) + + # Re-tar + local({ + old <- setwd(tempdir) + on.exit(setwd(old), add = TRUE) + + tar(destfile, compression = "gzip") + }) + invisible() + } + + # Extract the commit hash from a git archive. Git archives include the SHA1 + # hash as the comment field of the tarball pax extended header + # (see https://www.kernel.org/pub/software/scm/git/docs/git-archive.html) + # For GitHub archives this should be the first header after the default one + # (512 byte) header. 
+ renv_bootstrap_git_extract_sha1_tar <- function(bundle) { + + # open the bundle for reading + # We use gzcon for everything because (from ?gzcon) + # > Reading from a connection which does not supply a 'gzip' magic + # > header is equivalent to reading from the original connection + conn <- gzcon(file(bundle, open = "rb", raw = TRUE)) + on.exit(close(conn)) + + # The default pax header is 512 bytes long and the first pax extended header + # with the comment should be 51 bytes long + # `52 comment=` (11 chars) + 40 byte SHA1 hash + len <- 0x200 + 0x33 + res <- rawToChar(readBin(conn, "raw", n = len)[0x201:len]) + + if (grepl("^52 comment=", res)) { + sub("52 comment=", "", res) + } else { + NULL + } + } + + renv_bootstrap_install <- function(version, tarball, library) { + + # attempt to install it into project library + dir.create(library, showWarnings = FALSE, recursive = TRUE) + output <- renv_bootstrap_install_impl(library, tarball) + + # check for successful install + status <- attr(output, "status") + if (is.null(status) || identical(status, 0L)) + return(status) + + # an error occurred; report it + header <- "installation of renv failed" + lines <- paste(rep.int("=", nchar(header)), collapse = "") + text <- paste(c(header, lines, output), collapse = "\n") + stop(text) + + } + + renv_bootstrap_install_impl <- function(library, tarball) { + + # invoke using system2 so we can capture and report output + bin <- R.home("bin") + exe <- if (Sys.info()[["sysname"]] == "Windows") "R.exe" else "R" + R <- file.path(bin, exe) + + args <- c( + "--vanilla", "CMD", "INSTALL", "--no-multiarch", + "-l", shQuote(path.expand(library)), + shQuote(path.expand(tarball)) + ) + + system2(R, args, stdout = TRUE, stderr = TRUE) + + } + + renv_bootstrap_platform_prefix <- function() { + + # construct version prefix + version <- paste(R.version$major, R.version$minor, sep = ".") + prefix <- paste("R", numeric_version(version)[1, 1:2], sep = "-") + + # include SVN revision for development versions of R + # (to avoid sharing platform-specific artefacts with released versions of R) + devel <- + identical(R.version[["status"]], "Under development (unstable)") || + identical(R.version[["nickname"]], "Unsuffered Consequences") + + if (devel) + prefix <- paste(prefix, R.version[["svn rev"]], sep = "-r") + + # build list of path components + components <- c(prefix, R.version$platform) + + # include prefix if provided by user + prefix <- renv_bootstrap_platform_prefix_impl() + if (!is.na(prefix) && nzchar(prefix)) + components <- c(prefix, components) + + # build prefix + paste(components, collapse = "/") + + } + + renv_bootstrap_platform_prefix_impl <- function() { + + # if an explicit prefix has been supplied, use it + prefix <- Sys.getenv("RENV_PATHS_PREFIX", unset = NA) + if (!is.na(prefix)) + return(prefix) + + # if the user has requested an automatic prefix, generate it + auto <- Sys.getenv("RENV_PATHS_PREFIX_AUTO", unset = NA) + if (is.na(auto) && getRversion() >= "4.4.0") + auto <- "TRUE" + + if (auto %in% c("TRUE", "True", "true", "1")) + return(renv_bootstrap_platform_prefix_auto()) + + # empty string on failure + "" + + } + + renv_bootstrap_platform_prefix_auto <- function() { + + prefix <- tryCatch(renv_bootstrap_platform_os(), error = identity) + if (inherits(prefix, "error") || prefix %in% "unknown") { + + msg <- paste( + "failed to infer current operating system", + "please file a bug report at https://github.com/rstudio/renv/issues", + sep = "; " + ) + + warning(msg) + + } + + prefix + + } + 
+ renv_bootstrap_platform_os <- function() { + + sysinfo <- Sys.info() + sysname <- sysinfo[["sysname"]] + + # handle Windows + macOS up front + if (sysname == "Windows") + return("windows") + else if (sysname == "Darwin") + return("macos") + + # check for os-release files + for (file in c("/etc/os-release", "/usr/lib/os-release")) + if (file.exists(file)) + return(renv_bootstrap_platform_os_via_os_release(file, sysinfo)) + + # check for redhat-release files + if (file.exists("/etc/redhat-release")) + return(renv_bootstrap_platform_os_via_redhat_release()) + + "unknown" + + } + + renv_bootstrap_platform_os_via_os_release <- function(file, sysinfo) { + + # read /etc/os-release + release <- utils::read.table( + file = file, + sep = "=", + quote = c("\"", "'"), + col.names = c("Key", "Value"), + comment.char = "#", + stringsAsFactors = FALSE + ) + + vars <- as.list(release$Value) + names(vars) <- release$Key + + # get os name + os <- tolower(sysinfo[["sysname"]]) + + # read id + id <- "unknown" + for (field in c("ID", "ID_LIKE")) { + if (field %in% names(vars) && nzchar(vars[[field]])) { + id <- vars[[field]] + break + } + } + + # read version + version <- "unknown" + for (field in c("UBUNTU_CODENAME", "VERSION_CODENAME", "VERSION_ID", "BUILD_ID")) { + if (field %in% names(vars) && nzchar(vars[[field]])) { + version <- vars[[field]] + break + } + } + + # join together + paste(c(os, id, version), collapse = "-") + + } + + renv_bootstrap_platform_os_via_redhat_release <- function() { + + # read /etc/redhat-release + contents <- readLines("/etc/redhat-release", warn = FALSE) + + # infer id + id <- if (grepl("centos", contents, ignore.case = TRUE)) + "centos" + else if (grepl("redhat", contents, ignore.case = TRUE)) + "redhat" + else + "unknown" + + # try to find a version component (very hacky) + version <- "unknown" + + parts <- strsplit(contents, "[[:space:]]")[[1L]] + for (part in parts) { + + nv <- tryCatch(numeric_version(part), error = identity) + if (inherits(nv, "error")) + next + + version <- nv[1, 1] + break + + } + + paste(c("linux", id, version), collapse = "-") + + } + + renv_bootstrap_library_root_name <- function(project) { + + # use project name as-is if requested + asis <- Sys.getenv("RENV_PATHS_LIBRARY_ROOT_ASIS", unset = "FALSE") + if (asis) + return(basename(project)) + + # otherwise, disambiguate based on project's path + id <- substring(renv_bootstrap_hash_text(project), 1L, 8L) + paste(basename(project), id, sep = "-") + + } + + renv_bootstrap_library_root <- function(project) { + + prefix <- renv_bootstrap_profile_prefix() + + path <- Sys.getenv("RENV_PATHS_LIBRARY", unset = NA) + if (!is.na(path)) + return(paste(c(path, prefix), collapse = "/")) + + path <- renv_bootstrap_library_root_impl(project) + if (!is.null(path)) { + name <- renv_bootstrap_library_root_name(project) + return(paste(c(path, prefix, name), collapse = "/")) + } + + renv_bootstrap_paths_renv("library", project = project) + + } + + renv_bootstrap_library_root_impl <- function(project) { + + root <- Sys.getenv("RENV_PATHS_LIBRARY_ROOT", unset = NA) + if (!is.na(root)) + return(root) + + type <- renv_bootstrap_project_type(project) + if (identical(type, "package")) { + userdir <- renv_bootstrap_user_dir() + return(file.path(userdir, "library")) + } + + } + + renv_bootstrap_validate_version <- function(version, description = NULL) { + + # resolve description file + # + # avoid passing lib.loc to `packageDescription()` below, since R will + # use the loaded version of the package by default anyhow. 
note that + # this function should only be called after 'renv' is loaded + # https://github.com/rstudio/renv/issues/1625 + description <- description %||% packageDescription("renv") + + # check whether requested version 'version' matches loaded version of renv + sha <- attr(version, "sha", exact = TRUE) + valid <- if (!is.null(sha)) + renv_bootstrap_validate_version_dev(sha, description) + else + renv_bootstrap_validate_version_release(version, description) + + if (valid) + return(TRUE) + + # the loaded version of renv doesn't match the requested version; + # give the user instructions on how to proceed + dev <- identical(description[["RemoteType"]], "github") + remote <- if (dev) + paste("rstudio/renv", description[["RemoteSha"]], sep = "@") + else + paste("renv", description[["Version"]], sep = "@") + + # display both loaded version + sha if available + friendly <- renv_bootstrap_version_friendly( + version = description[["Version"]], + sha = if (dev) description[["RemoteSha"]] + ) + + fmt <- heredoc(" + renv %1$s was loaded from project library, but this project is configured to use renv %2$s. + - Use `renv::record(\"%3$s\")` to record renv %1$s in the lockfile. + - Use `renv::restore(packages = \"renv\")` to install renv %2$s into the project library. + ") + catf(fmt, friendly, renv_bootstrap_version_friendly(version), remote) + + FALSE + + } + + renv_bootstrap_validate_version_dev <- function(version, description) { + expected <- description[["RemoteSha"]] + is.character(expected) && startswith(expected, version) + } + + renv_bootstrap_validate_version_release <- function(version, description) { + expected <- description[["Version"]] + is.character(expected) && identical(expected, version) + } + + renv_bootstrap_hash_text <- function(text) { + + hashfile <- tempfile("renv-hash-") + on.exit(unlink(hashfile), add = TRUE) + + writeLines(text, con = hashfile) + tools::md5sum(hashfile) + + } + + renv_bootstrap_load <- function(project, libpath, version) { + + # try to load renv from the project library + if (!requireNamespace("renv", lib.loc = libpath, quietly = TRUE)) + return(FALSE) + + # warn if the version of renv loaded does not match + renv_bootstrap_validate_version(version) + + # execute renv load hooks, if any + hooks <- getHook("renv::autoload") + for (hook in hooks) + if (is.function(hook)) + tryCatch(hook(), error = warnify) + + # load the project + renv::load(project) + + TRUE + + } + + renv_bootstrap_profile_load <- function(project) { + + # if RENV_PROFILE is already set, just use that + profile <- Sys.getenv("RENV_PROFILE", unset = NA) + if (!is.na(profile) && nzchar(profile)) + return(profile) + + # check for a profile file (nothing to do if it doesn't exist) + path <- renv_bootstrap_paths_renv("profile", profile = FALSE, project = project) + if (!file.exists(path)) + return(NULL) + + # read the profile, and set it if it exists + contents <- readLines(path, warn = FALSE) + if (length(contents) == 0L) + return(NULL) + + # set RENV_PROFILE + profile <- contents[[1L]] + if (!profile %in% c("", "default")) + Sys.setenv(RENV_PROFILE = profile) + + profile + + } + + renv_bootstrap_profile_prefix <- function() { + profile <- renv_bootstrap_profile_get() + if (!is.null(profile)) + return(file.path("profiles", profile, "renv")) + } + + renv_bootstrap_profile_get <- function() { + profile <- Sys.getenv("RENV_PROFILE", unset = "") + renv_bootstrap_profile_normalize(profile) + } + + renv_bootstrap_profile_set <- function(profile) { + profile <- 
renv_bootstrap_profile_normalize(profile) + if (is.null(profile)) + Sys.unsetenv("RENV_PROFILE") + else + Sys.setenv(RENV_PROFILE = profile) + } + + renv_bootstrap_profile_normalize <- function(profile) { + + if (is.null(profile) || profile %in% c("", "default")) + return(NULL) + + profile + + } + + renv_bootstrap_path_absolute <- function(path) { + + substr(path, 1L, 1L) %in% c("~", "/", "\\") || ( + substr(path, 1L, 1L) %in% c(letters, LETTERS) && + substr(path, 2L, 3L) %in% c(":/", ":\\") + ) + + } + + renv_bootstrap_paths_renv <- function(..., profile = TRUE, project = NULL) { + renv <- Sys.getenv("RENV_PATHS_RENV", unset = "renv") + root <- if (renv_bootstrap_path_absolute(renv)) NULL else project + prefix <- if (profile) renv_bootstrap_profile_prefix() + components <- c(root, renv, prefix, ...) + paste(components, collapse = "/") + } + + renv_bootstrap_project_type <- function(path) { + + descpath <- file.path(path, "DESCRIPTION") + if (!file.exists(descpath)) + return("unknown") + + desc <- tryCatch( + read.dcf(descpath, all = TRUE), + error = identity + ) + + if (inherits(desc, "error")) + return("unknown") + + type <- desc$Type + if (!is.null(type)) + return(tolower(type)) + + package <- desc$Package + if (!is.null(package)) + return("package") + + "unknown" + + } + + renv_bootstrap_user_dir <- function() { + dir <- renv_bootstrap_user_dir_impl() + path.expand(chartr("\\", "/", dir)) + } + + renv_bootstrap_user_dir_impl <- function() { + + # use local override if set + override <- getOption("renv.userdir.override") + if (!is.null(override)) + return(override) + + # use R_user_dir if available + tools <- asNamespace("tools") + if (is.function(tools$R_user_dir)) + return(tools$R_user_dir("renv", "cache")) + + # try using our own backfill for older versions of R + envvars <- c("R_USER_CACHE_DIR", "XDG_CACHE_HOME") + for (envvar in envvars) { + root <- Sys.getenv(envvar, unset = NA) + if (!is.na(root)) + return(file.path(root, "R/renv")) + } + + # use platform-specific default fallbacks + if (Sys.info()[["sysname"]] == "Windows") + file.path(Sys.getenv("LOCALAPPDATA"), "R/cache/R/renv") + else if (Sys.info()[["sysname"]] == "Darwin") + "~/Library/Caches/org.R-project.R/R/renv" + else + "~/.cache/R/renv" + + } + + renv_bootstrap_version_friendly <- function(version, shafmt = NULL, sha = NULL) { + sha <- sha %||% attr(version, "sha", exact = TRUE) + parts <- c(version, sprintf(shafmt %||% " [sha: %s]", substring(sha, 1L, 7L))) + paste(parts, collapse = "") + } + + renv_bootstrap_exec <- function(project, libpath, version) { + if (!renv_bootstrap_load(project, libpath, version)) + renv_bootstrap_run(version, libpath) + } + + renv_bootstrap_run <- function(version, libpath) { + + # perform bootstrap + bootstrap(version, libpath) + + # exit early if we're just testing bootstrap + if (!is.na(Sys.getenv("RENV_BOOTSTRAP_INSTALL_ONLY", unset = NA))) + return(TRUE) + + # try again to load + if (requireNamespace("renv", lib.loc = libpath, quietly = TRUE)) { + return(renv::load(project = getwd())) + } + + # failed to download or load renv; warn the user + msg <- c( + "Failed to find an renv installation: the project will not be loaded.", + "Use `renv::activate()` to re-initialize the project." + ) + + warning(paste(msg, collapse = "\n"), call. 
= FALSE) + + } + + renv_json_read <- function(file = NULL, text = NULL) { + + jlerr <- NULL + + # if jsonlite is loaded, use that instead + if ("jsonlite" %in% loadedNamespaces()) { + + json <- tryCatch(renv_json_read_jsonlite(file, text), error = identity) + if (!inherits(json, "error")) + return(json) + + jlerr <- json + + } + + # otherwise, fall back to the default JSON reader + json <- tryCatch(renv_json_read_default(file, text), error = identity) + if (!inherits(json, "error")) + return(json) + + # report an error + if (!is.null(jlerr)) + stop(jlerr) + else + stop(json) + + } + + renv_json_read_jsonlite <- function(file = NULL, text = NULL) { + text <- paste(text %||% readLines(file, warn = FALSE), collapse = "\n") + jsonlite::fromJSON(txt = text, simplifyVector = FALSE) + } + + renv_json_read_default <- function(file = NULL, text = NULL) { + + # find strings in the JSON + text <- paste(text %||% readLines(file, warn = FALSE), collapse = "\n") + pattern <- '["](?:(?:\\\\.)|(?:[^"\\\\]))*?["]' + locs <- gregexpr(pattern, text, perl = TRUE)[[1]] + + # if any are found, replace them with placeholders + replaced <- text + strings <- character() + replacements <- character() + + if (!identical(c(locs), -1L)) { + + # get the string values + starts <- locs + ends <- locs + attr(locs, "match.length") - 1L + strings <- substring(text, starts, ends) + + # only keep those requiring escaping + strings <- grep("[[\\]{}:]", strings, perl = TRUE, value = TRUE) + + # compute replacements + replacements <- sprintf('"\032%i\032"', seq_along(strings)) + + # replace the strings + mapply(function(string, replacement) { + replaced <<- sub(string, replacement, replaced, fixed = TRUE) + }, strings, replacements) + + } + + # transform the JSON into something the R parser understands + transformed <- replaced + transformed <- gsub("{}", "`names<-`(list(), character())", transformed, fixed = TRUE) + transformed <- gsub("[[{]", "list(", transformed, perl = TRUE) + transformed <- gsub("[]}]", ")", transformed, perl = TRUE) + transformed <- gsub(":", "=", transformed, fixed = TRUE) + text <- paste(transformed, collapse = "\n") + + # parse it + json <- parse(text = text, keep.source = FALSE, srcfile = NULL)[[1L]] + + # construct map between source strings, replaced strings + map <- as.character(parse(text = strings)) + names(map) <- as.character(parse(text = replacements)) + + # convert to list + map <- as.list(map) + + # remap strings in object + remapped <- renv_json_read_remap(json, map) + + # evaluate + eval(remapped, envir = baseenv()) + + } + + renv_json_read_remap <- function(json, map) { + + # fix names + if (!is.null(names(json))) { + lhs <- match(names(json), names(map), nomatch = 0L) + rhs <- match(names(map), names(json), nomatch = 0L) + names(json)[rhs] <- map[lhs] + } + + # fix values + if (is.character(json)) + return(map[[json]] %||% json) + + # handle true, false, null + if (is.name(json)) { + text <- as.character(json) + if (text == "true") + return(TRUE) + else if (text == "false") + return(FALSE) + else if (text == "null") + return(NULL) + } + + # recurse + if (is.recursive(json)) { + for (i in seq_along(json)) { + json[i] <- list(renv_json_read_remap(json[[i]], map)) + } + } + + json + + } + + # load the renv profile, if any + renv_bootstrap_profile_load(project) + + # construct path to library root + root <- renv_bootstrap_library_root(project) + + # construct library prefix for platform + prefix <- renv_bootstrap_platform_prefix() + + # construct full libpath + libpath <- file.path(root, 
prefix) + + # run bootstrap code + renv_bootstrap_exec(project, libpath, version) + + invisible() + +}) diff --git a/notebooks/renv/settings.json b/notebooks/renv/settings.json new file mode 100644 index 000000000..ffdbb3200 --- /dev/null +++ b/notebooks/renv/settings.json @@ -0,0 +1,19 @@ +{ + "bioconductor.version": null, + "external.libraries": [], + "ignored.packages": [], + "package.dependency.fields": [ + "Imports", + "Depends", + "LinkingTo" + ], + "ppm.enabled": null, + "ppm.ignored.urls": [], + "r.version": null, + "snapshot.type": "implicit", + "use.cache": true, + "vcs.ignore.cellar": true, + "vcs.ignore.library": true, + "vcs.ignore.local": true, + "vcs.manage.ignores": true +} diff --git a/_template_python/.pylintrc b/nssp/.pylintrc similarity index 100% rename from _template_python/.pylintrc rename to nssp/.pylintrc diff --git a/nssp/DETAILS.md b/nssp/DETAILS.md new file mode 100644 index 000000000..539697baa --- /dev/null +++ b/nssp/DETAILS.md @@ -0,0 +1,13 @@ +# NSSP data + +We import the NSSP Emergency Department Visit data, including percentage and smoothed percentage of ER visits attributable to a given pathogen, from the CDC website. The data is provided at the county level, state level and national level; we do a population-weighted mean to aggregate from county data up to the HRR and MSA levels. + +## Geographical Levels +* `state`: reported using two-letter postal code +* `county`: reported using fips code +* `national`: just `us` for now +## Metrics +* `percent_visits_covid`, `percent_visits_rsv`, `percent_visits_influenza`: percentage of emergency department patient visits for specified pathogen. +* `percent_visits_combined`: sum of the three percentages of visits for flu, rsv and covid. +* `smoothed_percent_visits_covid`, `smoothed_percent_visits_rsv`, `smoothed_percent_visits_influenza`: 3 week moving average of the percentage of emergency department patient visits for specified pathogen. +* `smoothed_percent_visits_combined`: 3 week moving average of the sum of the three percentages of visits for flu, rsv and covid. \ No newline at end of file diff --git a/nssp/Makefile b/nssp/Makefile new file mode 100644 index 000000000..390113eef --- /dev/null +++ b/nssp/Makefile @@ -0,0 +1,32 @@ +.PHONY = venv, lint, test, clean + +dir = $(shell find ./delphi_* -name __init__.py | grep -o 'delphi_[_[:alnum:]]*' | head -1) +venv: + python3.8 -m venv env + +install: venv + . env/bin/activate; \ + pip install wheel ; \ + pip install -e ../_delphi_utils_python ;\ + pip install -e . + +install-ci: venv + . env/bin/activate; \ + pip install wheel ; \ + pip install ../_delphi_utils_python ;\ + pip install . + +lint: + . env/bin/activate; pylint $(dir) --rcfile=../pyproject.toml + . env/bin/activate; pydocstyle $(dir) + +format: + . env/bin/activate; darker $(dir) + +test: + . env/bin/activate ;\ + (cd tests && ../env/bin/pytest --cov=$(dir) --cov-report=term-missing) + +clean: + rm -rf env + rm -f params.json diff --git a/nssp/README.md b/nssp/README.md new file mode 100644 index 000000000..4bba6f626 --- /dev/null +++ b/nssp/README.md @@ -0,0 +1,75 @@ +# NSSP Emergency Department Visit data + +We import the NSSP Emergency Department Visit data, currently only the smoothed concentration, from the CDC website, aggregate to the state and national level from the wastewater sample site level, and export the aggregated data. +For details see the `DETAILS.md` file in this directory. 
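The README paragraph above describes pulling NSSP ED visit data from the CDC's Socrata endpoint and aggregating it for export. As a quick orientation before the token setup below, here is a minimal sketch of what that pull step amounts to; it mirrors the paged Socrata query performed by the new `delphi_nssp/pull.py` in this PR. The token value is a placeholder, and error handling and type coercion are omitted.

```python
# Minimal sketch of the paged SODA pull used by delphi_nssp (token is a placeholder).
import pandas as pd
from sodapy import Socrata


def fetch_nssp_records(socrata_token: str, limit: int = 50000) -> pd.DataFrame:
    """Page through the NSSP ED visits dataset (rdmq-nq56) and return one DataFrame."""
    client = Socrata("data.cdc.gov", socrata_token)
    records, offset = [], 0
    while True:
        page = client.get("rdmq-nq56", limit=limit, offset=offset)
        if not page:  # an empty page means the dataset has been read completely
            break
        records.extend(page)
        offset += limit
    return pd.DataFrame.from_records(records)


# df = fetch_nssp_records("MY_APP_TOKEN")  # hypothetical token, obtained as described below
```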
+ +## Create a MyAppToken +`MyAppToken` is required when fetching data from SODA Consumer API +(https://dev.socrata.com/foundry/data.cdc.gov/r8kw-7aab). Follow the +steps below to create a MyAppToken. +- Click the `Sign up for an app token` button on the linked website +- Sign In or Sign Up with Socrata ID +- Click the `Create New App Token` button +- Fill in `Application Name` and `Description` (You can just use delphi_nssp + for both) and click `Save` +- Copy the `App Token` + + +## Running the Indicator + +The indicator is run by directly executing the Python module contained in this +directory. The safest way to do this is to create a virtual environment, +install the common DELPHI tools, and then install the module and its +dependencies. To do this, run the following command from this directory: + +``` +make install +``` + +This command will install the package in editable mode, so you can make changes that +will automatically propagate to the installed package. + +All of the user-changeable parameters are stored in `params.json`. To execute +the module and produce the output datasets (by default, in `receiving`), run +the following: + +``` +env/bin/python -m delphi_nssp +``` + +If you want to enter the virtual environment in your shell, +you can run `source env/bin/activate`. Run `deactivate` to leave the virtual environment. + +Once you are finished, you can remove the virtual environment and +params file with the following: + +``` +make clean +``` + +## Testing the code + +To run static tests of the code style, run the following command: + +``` +make lint +``` + +Unit tests are also included in the module. To execute these, run the following +command from this directory: + +``` +make test +``` + +To run individual tests, run the following: + +``` +(cd tests && ../env/bin/pytest <your_test_file>.py --cov=delphi_nssp --cov-report=term-missing) +``` + +The output will show the number of unit tests that passed and failed, along +with the percentage of code covered by the tests. + +None of the linting or unit tests should fail, and the code lines that are not covered by unit tests should be small and +should not include critical sub-routines. diff --git a/nssp/REVIEW.md b/nssp/REVIEW.md new file mode 100644 index 000000000..03f87b17a --- /dev/null +++ b/nssp/REVIEW.md @@ -0,0 +1,38 @@ +## Code Review (Python) + +A code review of this module should include a careful look at the code and the +output. To assist in the process, but certainly not in place of it, please +check the following items.
+ +**Documentation** + +- [ ] the README.md file template is filled out and currently accurate; it is +possible to load and test the code using only the instructions given +- [ ] minimal docstrings (one line describing what the function does) are +included for all functions; full docstrings describing the inputs and expected +outputs should be given for non-trivial functions + +**Structure** + +- [ ] code should pass lint checks (`make lint`) +- [ ] any required metadata files are checked into the repository and placed +within the directory `static` +- [ ] any intermediate files that are created and stored by the module should +be placed in the directory `cache` +- [ ] final expected output files to be uploaded to the API are placed in the +`receiving` directory; output files should not be committed to the repository +- [ ] all options and API keys are passed through the file `params.json` +- [ ] template parameter file (`params.json.template`) is checked into the +code; no personal (i.e., usernames) or private (i.e., API keys) information is +included in this template file + +**Testing** + +- [ ] module can be installed in a new virtual environment (`make install`) +- [ ] reasonably high level of unit test coverage covering all of the main logic +of the code (e.g., missing coverage for raised errors that do not currently seem +possible to reach are okay; missing coverage for options that will be needed are +not) +- [ ] all unit tests run without errors (`make test`) +- [ ] indicator directory has been added to GitHub CI +(`covidcast-indicators/.github/workflows/python-ci.yml`) diff --git a/nssp/cache/.gitignore b/nssp/cache/.gitignore new file mode 100644 index 000000000..e69de29bb diff --git a/nssp/delphi_nssp/__init__.py b/nssp/delphi_nssp/__init__.py new file mode 100644 index 000000000..827935a53 --- /dev/null +++ b/nssp/delphi_nssp/__init__.py @@ -0,0 +1,13 @@ +# -*- coding: utf-8 -*- +"""Module to pull and clean indicators from the NSSP source. + +This file defines the functions that are made public by the module. As the +module is intended to be executed through the main method, these are primarily +for testing. +""" + +from __future__ import absolute_import + +from . import pull, run + +__version__ = "0.1.0" diff --git a/nssp/delphi_nssp/__main__.py b/nssp/delphi_nssp/__main__.py new file mode 100644 index 000000000..105f8e2d2 --- /dev/null +++ b/nssp/delphi_nssp/__main__.py @@ -0,0 +1,13 @@ +# -*- coding: utf-8 -*- +"""Call the function run_module when executed. + +This file indicates that calling the module (`python -m delphi_nssp`) will +call the function `run_module` found within the run.py file. There should be +no need to change this template.
+""" + +from delphi_utils import read_params + +from .run import run_module # pragma: no cover + +run_module(read_params()) # pragma: no cover diff --git a/nssp/delphi_nssp/constants.py b/nssp/delphi_nssp/constants.py new file mode 100644 index 000000000..ddd2e74b8 --- /dev/null +++ b/nssp/delphi_nssp/constants.py @@ -0,0 +1,42 @@ +"""Registry for variations.""" + +GEOS = [ + "hrr", + "msa", + "nation", + "state", + "county", +] + +SIGNALS_MAP = { + "percent_visits_covid": "pct_ed_visits_covid", + "percent_visits_influenza": "pct_ed_visits_influenza", + "percent_visits_rsv": "pct_ed_visits_rsv", + "percent_visits_combined": "pct_ed_visits_combined", + "percent_visits_smoothed_covid": "smoothed_pct_ed_visits_covid", + "percent_visits_smoothed_1": "smoothed_pct_ed_visits_influenza", + "percent_visits_smoothed_rsv": "smoothed_pct_ed_visits_rsv", + "percent_visits_smoothed": "smoothed_pct_ed_visits_combined", +} + +SIGNALS = [val for (key, val) in SIGNALS_MAP.items()] +NEWLINE = "\n" + +AUXILIARY_COLS = [ + "se", + "sample_size", + "missing_val", + "missing_se", + "missing_sample_size", +] +CSV_COLS = ["geo_id", "val"] + AUXILIARY_COLS + +TYPE_DICT = {key: float for key in SIGNALS} +TYPE_DICT.update( + { + "timestamp": "datetime64[ns]", + "geography": str, + "county": str, + "fips": str, + } +) diff --git a/nssp/delphi_nssp/pull.py b/nssp/delphi_nssp/pull.py new file mode 100644 index 000000000..ece94fab4 --- /dev/null +++ b/nssp/delphi_nssp/pull.py @@ -0,0 +1,74 @@ +# -*- coding: utf-8 -*- +"""Functions for pulling NSSP ER data.""" + +import textwrap + +import pandas as pd +from sodapy import Socrata + +from .constants import NEWLINE, SIGNALS, SIGNALS_MAP, TYPE_DICT + + +def warn_string(df, type_dict): + """Format the warning string.""" + warn = textwrap.dedent( + f""" + Expected column(s) missed, The dataset schema may + have changed. Please investigate and amend the code. + + Columns needed: + {NEWLINE.join(sorted(type_dict.keys()))} + + Columns available: + {NEWLINE.join(sorted(df.columns))} + """ + ) + + return warn + + +def pull_nssp_data(socrata_token: str): + """Pull the latest NSSP ER visits data, and conforms it into a dataset. + + The output dataset has: + + - Each row corresponds to a single observation + - Each row additionally has columns for the signals in SIGNALS + + Parameters + ---------- + socrata_token: str + My App Token for pulling the NWSS data (could be the same as the nchs data) + test_file: Optional[str] + When not null, name of file from which to read test data + + Returns + ------- + pd.DataFrame + Dataframe as described above. 
+ """ + # Pull data from Socrata API + client = Socrata("data.cdc.gov", socrata_token) + results = [] + offset = 0 + limit = 50000 # maximum limit allowed by SODA 2.0 + while True: + page = client.get("rdmq-nq56", limit=limit, offset=offset) + if not page: + break # exit the loop if no more results + results.extend(page) + offset += limit + df_ervisits = pd.DataFrame.from_records(results) + df_ervisits = df_ervisits.rename(columns={"week_end": "timestamp"}) + df_ervisits = df_ervisits.rename(columns=SIGNALS_MAP) + + try: + df_ervisits = df_ervisits.astype(TYPE_DICT) + except KeyError as exc: + raise ValueError(warn_string(df_ervisits, TYPE_DICT)) from exc + + # Format county fips to all be 5 digits with leading zeros + df_ervisits["fips"] = df_ervisits["fips"].apply(lambda x: str(x).zfill(5) if str(x) != "0" else "0") + + keep_columns = ["timestamp", "geography", "county", "fips"] + return df_ervisits[SIGNALS + keep_columns] diff --git a/nssp/delphi_nssp/run.py b/nssp/delphi_nssp/run.py new file mode 100644 index 000000000..7c5a3ffac --- /dev/null +++ b/nssp/delphi_nssp/run.py @@ -0,0 +1,133 @@ +# -*- coding: utf-8 -*- +"""Functions to call when running the function. + +This module should contain a function called `run_module`, that is executed +when the module is run with `python -m delphi_nssp`. `run_module`'s lone argument should be a +nested dictionary of parameters loaded from the params.json file. We expect the `params` to have +the following structure: + - "common": + - "export_dir": str, directory to write daily output + - "log_filename": (optional) str, path to log file + - "log_exceptions" (optional): bool, whether to log exceptions to file + - "indicator": (optional) + - "wip_signal": (optional) Any[str, bool], list of signals that are + works in progress, or True if all signals in the registry are works + in progress, or False if only unpublished signals are. 
See + `delphi_utils.add_prefix()` + - "test_file" (optional): str, name of file from which to read test data + - "socrata_token": str, authentication for upstream data pull + - "archive" (optional): if provided, output will be archived with S3 + - "aws_credentials": Dict[str, str], AWS login credentials (see S3 documentation) + - "bucket_name: str, name of S3 bucket to read/write + - "cache_dir": str, directory of locally cached data +""" + +import time +from datetime import datetime + +import numpy as np +import us +from delphi_utils import create_export_csv, get_structured_logger +from delphi_utils.geomap import GeoMapper +from delphi_utils.nancodes import add_default_nancodes + +from .constants import AUXILIARY_COLS, CSV_COLS, GEOS, SIGNALS +from .pull import pull_nssp_data + + +def add_needed_columns(df, col_names=None): + """Short util to add expected columns not found in the dataset.""" + if col_names is None: + col_names = AUXILIARY_COLS + + for col_name in col_names: + df[col_name] = np.nan + df = add_default_nancodes(df) + return df + + +def logging(start_time, run_stats, logger): + """Boilerplate making logs.""" + elapsed_time_in_seconds = round(time.time() - start_time, 2) + min_max_date = run_stats and min(s[0] for s in run_stats) + csv_export_count = sum(s[-1] for s in run_stats) + max_lag_in_days = min_max_date and (datetime.now() - min_max_date).days + formatted_min_max_date = min_max_date and min_max_date.strftime("%Y-%m-%d") + logger.info( + "Completed indicator run", + elapsed_time_in_seconds=elapsed_time_in_seconds, + csv_export_count=csv_export_count, + max_lag_in_days=max_lag_in_days, + oldest_final_export_date=formatted_min_max_date, + ) + + +def run_module(params): + """ + Run the indicator. + + Arguments + -------- + params: Dict[str, Any] + Nested dictionary of parameters. + """ + start_time = time.time() + logger = get_structured_logger( + __name__, + filename=params["common"].get("log_filename"), + log_exceptions=params["common"].get("log_exceptions", True), + ) + export_dir = params["common"]["export_dir"] + socrata_token = params["indicator"]["socrata_token"] + + run_stats = [] + ## build the base version of the signal at the most detailed geo level you can get. 
+ ## compute stuff here or farm out to another function or file + df_pull = pull_nssp_data(socrata_token) + ## aggregate + geo_mapper = GeoMapper() + for signal in SIGNALS: + for geo in GEOS: + df = df_pull.copy() + df["val"] = df[signal] + logger.info("Generating signal and exporting to CSV", metric=signal) + if geo == "nation": + df = df[df["geography"] == "United States"] + df["geo_id"] = "us" + elif geo == "state": + df = df[(df["county"] == "All") & (df["geography"] != "United States")] + df["geo_id"] = df["geography"].apply( + lambda x: us.states.lookup(x).abbr.lower() if us.states.lookup(x) else "dc" + ) + elif geo == "hrr": + df = df[["fips", "val", "timestamp"]] + # fips -> hrr has a weighted version + df = geo_mapper.replace_geocode(df, "fips", "hrr") + df = df.rename(columns={"hrr": "geo_id"}) + elif geo == "msa": + df = df[["fips", "val", "timestamp"]] + # fips -> msa doesn't have a weighted version, so we need to add columns and sum ourselves + df = geo_mapper.add_population_column(df, geocode_type="fips", geocode_col="fips") + df = geo_mapper.add_geocode(df, "fips", "msa", from_col="fips", new_col="geo_id") + df = geo_mapper.aggregate_by_weighted_sum(df, "geo_id", "val", "timestamp", "population") + df = df.rename(columns={"weighted_val": "val"}) + else: + df = df[df["county"] != "All"] + df["geo_id"] = df["fips"] + # add se, sample_size, and na codes + missing_cols = set(CSV_COLS) - set(df.columns) + df = add_needed_columns(df, col_names=list(missing_cols)) + df_csv = df[CSV_COLS + ["timestamp"]] + # actual export + dates = create_export_csv( + df_csv, + geo_res=geo, + export_dir=export_dir, + sensor=signal, + weekly_dates=True, + ) + if len(dates) > 0: + run_stats.append((max(dates), len(dates))) + + ## log this indicator run + logging(start_time, run_stats, logger) diff --git a/nssp/params.json.template b/nssp/params.json.template new file mode 100644 index 000000000..df989ede7 --- /dev/null +++ b/nssp/params.json.template @@ -0,0 +1,30 @@ +{ + "common": { + "export_dir": "./receiving", + "log_filename": "./nssp.log", + "log_exceptions": false + }, + "indicator": { + "wip_signal": true, + "static_file_dir": "./static", + "socrata_token": "" + }, + "validation": { + "common": { + "data_source": "nssp", + "api_credentials": "{{ validation_api_key }}", + "span_length": 15, + "min_expected_lag": {"all": "7"}, + "max_expected_lag": {"all": "13"}, + "dry_run": true, + "suppressed_errors": [] + }, + "static": { + "minimum_sample_size": 0, + "missing_se_allowed": true, + "missing_sample_size_allowed": true + }, + "dynamic": {} + } +} + diff --git a/nssp/receiving/.gitignore b/nssp/receiving/.gitignore new file mode 100644 index 000000000..afed0735d --- /dev/null +++ b/nssp/receiving/.gitignore @@ -0,0 +1 @@ +*.csv diff --git a/nssp/setup.py b/nssp/setup.py new file mode 100644 index 000000000..a6cbf640a --- /dev/null +++ b/nssp/setup.py @@ -0,0 +1,32 @@ +from setuptools import setup +from setuptools import find_packages + +required = [ + "numpy", + "pandas", + "pydocstyle", + "pytest", + "pytest-cov", + "pylint==2.8.3", + "delphi-utils", + "sodapy", + "epiweeks", + "freezegun", + "us", +] + +setup( + name="delphi_nssp", + version="0.1.0", + description="Indicators NSSP Emergency Department Visit", + author="Minh Le", + author_email="minhkhul@andrew.cmu.edu", + url="https://github.com/cmu-delphi/covidcast-indicators", + install_requires=required, + classifiers=[ + "Development Status :: 1 - Planning", + "Intended Audience :: Developers", + "Programming Language :: 
Python :: 3.8", + ], + packages=find_packages(), +) diff --git a/nssp/tests/test_data/page.txt b/nssp/tests/test_data/page.txt new file mode 100644 index 000000000..34dfa71a8 --- /dev/null +++ b/nssp/tests/test_data/page.txt @@ -0,0 +1,86 @@ +[ + { + "week_end": "2022-10-01T00:00:00.000", + "geography": "United States", + "county": "All", + "percent_visits_combined": "2.84", + "percent_visits_covid": "1.84", + "percent_visits_influenza": "0.48", + "percent_visits_rsv": "0.55", + "percent_visits_smoothed": "2.83", + "percent_visits_smoothed_covid": "2.07", + "percent_visits_smoothed_1": "0.34", + "percent_visits_smoothed_rsv": "0.44", + "ed_trends_covid": "Decreasing", + "ed_trends_influenza": "Increasing", + "ed_trends_rsv": "Increasing", + "hsa": "All", + "hsa_counties": "All", + "hsa_nci_id": "All", + "fips": "0", + "trend_source": "United States" + }, + { + "week_end": "2022-10-08T00:00:00.000", + "geography": "United States", + "county": "All", + "percent_visits_combined": "2.93", + "percent_visits_covid": "1.68", + "percent_visits_influenza": "0.68", + "percent_visits_rsv": "0.6", + "percent_visits_smoothed": "2.85", + "percent_visits_smoothed_covid": "1.85", + "percent_visits_smoothed_1": "0.49", + "percent_visits_smoothed_rsv": "0.53", + "ed_trends_covid": "Decreasing", + "ed_trends_influenza": "Increasing", + "ed_trends_rsv": "Increasing", + "hsa": "All", + "hsa_counties": "All", + "hsa_nci_id": "All", + "fips": "0", + "trend_source": "United States" + }, + { + "week_end": "2022-10-15T00:00:00.000", + "geography": "United States", + "county": "All", + "percent_visits_combined": "3.25", + "percent_visits_covid": "1.64", + "percent_visits_influenza": "0.9", + "percent_visits_rsv": "0.74", + "percent_visits_smoothed": "3.01", + "percent_visits_smoothed_covid": "1.72", + "percent_visits_smoothed_1": "0.69", + "percent_visits_smoothed_rsv": "0.63", + "ed_trends_covid": "Decreasing", + "ed_trends_influenza": "Increasing", + "ed_trends_rsv": "Increasing", + "hsa": "All", + "hsa_counties": "All", + "hsa_nci_id": "All", + "fips": "0", + "trend_source": "United States" + }, + { + "week_end": "2023-05-13T00:00:00.000", + "geography": "Colorado", + "county": "Jefferson", + "percent_visits_combined": "0.84", + "percent_visits_covid": "0.59", + "percent_visits_influenza": "0.23", + "percent_visits_rsv": "0.03", + "percent_visits_smoothed": "0.83", + "percent_visits_smoothed_covid": "0.62", + "percent_visits_smoothed_1": "0.18", + "percent_visits_smoothed_rsv": "0.02", + "ed_trends_covid": "Decreasing", + "ed_trends_influenza": "No Change", + "ed_trends_rsv": "Decreasing", + "hsa": "Denver (Denver), CO - Jefferson, CO", + "hsa_counties": "Adams, Arapahoe, Clear Creek, Denver, Douglas, Elbert, Gilpin, Grand, Jefferson, Park, Summit", + "hsa_nci_id": "688", + "fips": "8059", + "trend_source": "HSA" + } +] diff --git a/nssp/tests/test_pull.py b/nssp/tests/test_pull.py new file mode 100644 index 000000000..b356341f6 --- /dev/null +++ b/nssp/tests/test_pull.py @@ -0,0 +1,60 @@ +from datetime import datetime, date +import json +import unittest +from unittest.mock import patch, MagicMock +import tempfile +import os +import time +from datetime import datetime +import pdb +import pandas as pd +import pandas.api.types as ptypes + +from delphi_nssp.pull import ( + pull_nssp_data, +) +from delphi_nssp.constants import ( + SIGNALS, + NEWLINE, + SIGNALS_MAP, + TYPE_DICT, +) + + +class TestPullNSSPData(unittest.TestCase): + @patch("delphi_nssp.pull.Socrata") + def test_pull_nssp_data(self, mock_socrata): + # 
Load test data + with open("test_data/page.txt", "r") as f: + test_data = json.load(f) + + # Mock Socrata client and its get method + mock_client = MagicMock() + mock_client.get.side_effect = [test_data, []] # Return test data on first call, empty list on second call + mock_socrata.return_value = mock_client + + # Call function with test token + test_token = "test_token" + result = pull_nssp_data(test_token) + print(result) + + # Check that Socrata client was initialized with correct arguments + mock_socrata.assert_called_once_with("data.cdc.gov", test_token) + + # Check that get method was called with correct arguments + mock_client.get.assert_any_call("rdmq-nq56", limit=50000, offset=0) + + # Check result + assert result["timestamp"].notnull().all(), "timestamp has rogue NaN" + assert result["geography"].notnull().all(), "geography has rogue NaN" + assert result["county"].notnull().all(), "county has rogue NaN" + assert result["fips"].notnull().all(), "fips has rogue NaN" + assert result["fips"].apply(lambda x: isinstance(x, str) and len(x) != 4).all(), "fips formatting should always be 5 digits; include leading zeros if aplicable" + + # Check for each signal in SIGNALS + for signal in SIGNALS: + assert result[signal].notnull().all(), f"{signal} has rogue NaN" + + +if __name__ == "__main__": + unittest.main() diff --git a/nssp/tests/test_run.py b/nssp/tests/test_run.py new file mode 100644 index 000000000..72346cff7 --- /dev/null +++ b/nssp/tests/test_run.py @@ -0,0 +1,31 @@ +from datetime import datetime, date +import json +from unittest.mock import patch +import tempfile +import os +import time +from datetime import datetime + +import numpy as np +import pandas as pd +from pandas.testing import assert_frame_equal +from delphi_nssp.constants import GEOS, SIGNALS, CSV_COLS +from delphi_nssp.run import ( + add_needed_columns +) + + +def test_add_needed_columns(): + df = pd.DataFrame({"geo_id": ["us"], "val": [1]}) + df = add_needed_columns(df, col_names=None) + assert df.columns.tolist() == [ + "geo_id", + "val", + "se", + "sample_size", + "missing_val", + "missing_se", + "missing_sample_size", + ] + assert df["se"].isnull().all() + assert df["sample_size"].isnull().all() diff --git a/nwss_wastewater/.pylintrc b/nwss_wastewater/.pylintrc deleted file mode 100644 index f30837c7e..000000000 --- a/nwss_wastewater/.pylintrc +++ /dev/null @@ -1,22 +0,0 @@ - -[MESSAGES CONTROL] - -disable=logging-format-interpolation, - too-many-locals, - too-many-arguments, - # Allow pytest functions to be part of a class. - no-self-use, - # Allow pytest classes to have one test. - too-few-public-methods - -[BASIC] - -# Allow arbitrarily short-named variables. -variable-rgx=[a-z_][a-z0-9_]* -argument-rgx=[a-z_][a-z0-9_]* -attr-rgx=[a-z_][a-z0-9_]* - -[DESIGN] - -# Don't complain about pytest "unused" arguments. -ignored-argument-names=(_.*|run_as_module) \ No newline at end of file diff --git a/nwss_wastewater/Makefile b/nwss_wastewater/Makefile index bc88f1fec..390113eef 100644 --- a/nwss_wastewater/Makefile +++ b/nwss_wastewater/Makefile @@ -17,9 +17,12 @@ install-ci: venv pip install . lint: - . env/bin/activate; pylint $(dir) + . env/bin/activate; pylint $(dir) --rcfile=../pyproject.toml . env/bin/activate; pydocstyle $(dir) +format: + . env/bin/activate; darker $(dir) + test: . 
diff --git a/nssp/tests/test_run.py b/nssp/tests/test_run.py
new file mode 100644
index 000000000..72346cff7
--- /dev/null
+++ b/nssp/tests/test_run.py
@@ -0,0 +1,31 @@
+from datetime import datetime, date
+import json
+from unittest.mock import patch
+import tempfile
+import os
+import time
+from datetime import datetime
+
+import numpy as np
+import pandas as pd
+from pandas.testing import assert_frame_equal
+from delphi_nssp.constants import GEOS, SIGNALS, CSV_COLS
+from delphi_nssp.run import (
+    add_needed_columns
+)
+
+
+def test_add_needed_columns():
+    df = pd.DataFrame({"geo_id": ["us"], "val": [1]})
+    df = add_needed_columns(df, col_names=None)
+    assert df.columns.tolist() == [
+        "geo_id",
+        "val",
+        "se",
+        "sample_size",
+        "missing_val",
+        "missing_se",
+        "missing_sample_size",
+    ]
+    assert df["se"].isnull().all()
+    assert df["sample_size"].isnull().all()
diff --git a/nwss_wastewater/.pylintrc b/nwss_wastewater/.pylintrc
deleted file mode 100644
index f30837c7e..000000000
--- a/nwss_wastewater/.pylintrc
+++ /dev/null
@@ -1,22 +0,0 @@
-
-[MESSAGES CONTROL]
-
-disable=logging-format-interpolation,
-    too-many-locals,
-    too-many-arguments,
-    # Allow pytest functions to be part of a class.
-    no-self-use,
-    # Allow pytest classes to have one test.
-    too-few-public-methods
-
-[BASIC]
-
-# Allow arbitrarily short-named variables.
-variable-rgx=[a-z_][a-z0-9_]*
-argument-rgx=[a-z_][a-z0-9_]*
-attr-rgx=[a-z_][a-z0-9_]*
-
-[DESIGN]
-
-# Don't complain about pytest "unused" arguments.
-ignored-argument-names=(_.*|run_as_module)
\ No newline at end of file
diff --git a/nwss_wastewater/Makefile b/nwss_wastewater/Makefile
index bc88f1fec..390113eef 100644
--- a/nwss_wastewater/Makefile
+++ b/nwss_wastewater/Makefile
@@ -17,9 +17,12 @@ install-ci: venv
 	pip install .
 
 lint:
-	. env/bin/activate; pylint $(dir)
+	. env/bin/activate; pylint $(dir) --rcfile=../pyproject.toml
 	. env/bin/activate; pydocstyle $(dir)
 
+format:
+	. env/bin/activate; darker $(dir)
+
 test:
 	. env/bin/activate ;\
 	(cd tests && ../env/bin/pytest --cov=$(dir) --cov-report=term-missing)
diff --git a/nwss_wastewater/setup.py b/nwss_wastewater/setup.py
index f2cce8cb3..26f1b7324 100644
--- a/nwss_wastewater/setup.py
+++ b/nwss_wastewater/setup.py
@@ -2,16 +2,17 @@
 from setuptools import find_packages
 
 required = [
+    "darker[isort]~=2.1.1",
+    "delphi-utils",
+    "epiweeks",
+    "freezegun",
     "numpy",
     "pandas",
     "pydocstyle",
-    "pytest",
-    "pytest-cov",
     "pylint==2.8.3",
-    "delphi-utils",
+    "pytest-cov",
+    "pytest",
     "sodapy",
-    "epiweeks",
-    "freezegun",
 ]
 
 setup(
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 000000000..e194a54b7
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,41 @@
+[tool.black]
+line-length = 120
+target-version = ['py38']
+
+[tool.ruff]
+line-length = 120
+target-version = 'py38'
+
+[tool.darker]
+revision = 'origin/main...'
+color = true
+isort = true
+
+[tool.isort]
+profile = "black"
+known_third_party = ["pytest"]
+
+[tool.pylint]
+[tool.pylint.main]
+max-line-length = 120
+disable = [
+    'logging-format-interpolation',
+    # Allow pytest functions to be part of a class
+    'no-self-use',
+    'too-many-locals',
+    'too-many-arguments',
+    'too-many-branches',
+    'too-many-statements',
+    # Allow pytest classes to have one test
+    'too-few-public-methods',
+]
+enable = 'useless-suppression'
+
+[tool.pylint.basic]
+# Allow arbitrarily short-named variables.
+variable-rgx = '[A-Za-z_][a-z0-9_]*'
+argument-rgx = '[A-Za-z_][a-z0-9_]*'
+attr-rgx = '[A-Za-z_][a-z0-9_]*'
+
+[tool.pylint.design]
+ignored-argument-names = ['(_.*|run_as_module)']
diff --git a/quidel_covidtest/.pylintrc b/quidel_covidtest/.pylintrc
deleted file mode 100644
index 29bd9aac2..000000000
--- a/quidel_covidtest/.pylintrc
+++ /dev/null
@@ -1,24 +0,0 @@
-
-[MESSAGES CONTROL]
-
-disable=logging-format-interpolation,
-    too-many-locals,
-    too-many-arguments,
-    too-many-branches,
-    # Allow pytest functions to be part of a class.
-    no-self-use,
-    # Allow pytest classes to have one test.
-    too-few-public-methods
-enable=useless-suppression
-
-[BASIC]
-
-# Allow arbitrarily short-named variables.
-variable-rgx=[a-z_][a-z0-9_]*
-argument-rgx=[a-z_][a-z0-9_]*
-attr-rgx=[a-z_][a-z0-9_]*
-
-[DESIGN]
-
-# Don't complain about pytest "unused" arguments.
-ignored-argument-names=(_.*|run_as_module)
diff --git a/quidel_covidtest/Makefile b/quidel_covidtest/Makefile
index bc88f1fec..390113eef 100644
--- a/quidel_covidtest/Makefile
+++ b/quidel_covidtest/Makefile
@@ -17,9 +17,12 @@ install-ci: venv
 	pip install .
 
 lint:
-	. env/bin/activate; pylint $(dir)
+	. env/bin/activate; pylint $(dir) --rcfile=../pyproject.toml
 	. env/bin/activate; pydocstyle $(dir)
 
+format:
+	. env/bin/activate; darker $(dir)
+
 test:
 	. env/bin/activate ;\
 	(cd tests && ../env/bin/pytest --cov=$(dir) --cov-report=term-missing)
diff --git a/quidel_covidtest/setup.py b/quidel_covidtest/setup.py
index 369ac30c0..c2791930f 100644
--- a/quidel_covidtest/setup.py
+++ b/quidel_covidtest/setup.py
@@ -2,18 +2,19 @@
 from setuptools import find_packages
 
 required = [
+    "covidcast",
+    "darker[isort]~=2.1.1",
+    "delphi-utils",
+    "imap-tools",
     "numpy",
+    "openpyxl",
     "pandas",
     "pyarrow",
     "pydocstyle",
-    "pytest",
-    "pytest-cov",
     "pylint==2.8.3",
-    "delphi-utils",
-    "imap-tools",
+    "pytest-cov",
+    "pytest",
     "xlrd==1.2.0",
-    "covidcast",
-    "openpyxl"
 ]
 
 setup(
diff --git a/quidel_covidtest/version.cfg b/quidel_covidtest/version.cfg
index d3d61ed12..f5c28d2cd 100644
--- a/quidel_covidtest/version.cfg
+++ b/quidel_covidtest/version.cfg
@@ -1 +1 @@
-current_version = 0.3.54
+current_version = 0.3.55
diff --git a/sir_complainsalot/Makefile b/sir_complainsalot/Makefile
index bc88f1fec..390113eef 100644
--- a/sir_complainsalot/Makefile
+++ b/sir_complainsalot/Makefile
@@ -17,9 +17,12 @@ install-ci: venv
 	pip install .
 
 lint:
-	. env/bin/activate; pylint $(dir)
+	. env/bin/activate; pylint $(dir) --rcfile=../pyproject.toml
 	. env/bin/activate; pydocstyle $(dir)
 
+format:
+	. env/bin/activate; darker $(dir)
+
 test:
 	. env/bin/activate ;\
 	(cd tests && ../env/bin/pytest --cov=$(dir) --cov-report=term-missing)
diff --git a/sir_complainsalot/delphi_sir_complainsalot/run.py b/sir_complainsalot/delphi_sir_complainsalot/run.py
index 962fa1bc3..a1555b9c2 100644
--- a/sir_complainsalot/delphi_sir_complainsalot/run.py
+++ b/sir_complainsalot/delphi_sir_complainsalot/run.py
@@ -48,7 +48,7 @@ def run_module():
         elapsed_time_in_seconds = elapsed_time_in_seconds)
 
 
-def split_complaints(complaints, n=49):  # pylint: disable=invalid-name
+def split_complaints(complaints, n=49):
     """Yield successive n-sized chunks from complaints list."""
     for i in range(0, len(complaints), n):
         yield complaints[i:i + n]
diff --git a/sir_complainsalot/params.json.template b/sir_complainsalot/params.json.template
index b07b197a4..6b1bf870b 100644
--- a/sir_complainsalot/params.json.template
+++ b/sir_complainsalot/params.json.template
@@ -12,11 +12,6 @@
       "maintainers": ["U01AP8GSWG3","U01069KCRS7"],
       "retired-signals": ["smoothed_covid19","smoothed_adj_covid19"]
     },
-    "chng": {
-      "max_age": 6,
-      "maintainers": ["U01AP8GSWG3","U01069KCRS7"],
-      "retired-signals": ["7dav_outpatient_covid","7dav_inpatient_covid"]
-    },
     "google-symptoms": {
       "max_age": 6,
       "maintainers": ["U01AP8GSWG3","U01069KCRS7"],
@@ -47,8 +42,8 @@
       "max_age":19,
       "maintainers": []
     },
-    "hhs": {
-      "max_age":15,
+    "nssp": {
+      "max_age":13,
       "maintainers": []
     }
   }
diff --git a/sir_complainsalot/setup.py b/sir_complainsalot/setup.py
index c51253104..157c001b2 100644
--- a/sir_complainsalot/setup.py
+++ b/sir_complainsalot/setup.py
@@ -2,13 +2,14 @@
 from setuptools import find_packages
 
 required = [
+    "covidcast",
+    "darker[isort]~=2.1.1",
+    "delphi-utils",
     "pandas",
-    "pytest",
-    "pytest-cov",
     "pylint==2.8.3",
-    "delphi-utils",
+    "pytest-cov",
+    "pytest",
     "slackclient",
-    "covidcast"
 ]
 
 setup(
diff --git a/sir_complainsalot/version.cfg b/sir_complainsalot/version.cfg
index d3d61ed12..f5c28d2cd 100644
--- a/sir_complainsalot/version.cfg
+++ b/sir_complainsalot/version.cfg
@@ -1 +1 @@
-current_version = 0.3.54
+current_version = 0.3.55