Skip to content

Commit 517df70

Browse files
authored
Merge pull request #1064 from cmu-delphi/krivard/covid_hosp_datechecks
Fix problem with covid_hosp skipping state revisions.
2 parents 7643299 + 247a1e3 commit 517df70

File tree

9 files changed

+39
-18
lines changed

9 files changed

+39
-18
lines changed

src/acquisition/covid_hosp/common/database.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ class Database:
1919
def __init__(self,
2020
connection,
2121
table_name=None,
22+
hhs_dataset_id=None,
2223
columns_and_types=None,
2324
key_columns=None,
2425
additional_fields=None):
@@ -30,6 +31,8 @@ def __init__(self,
3031
An open connection to a database.
3132
table_name : str
3233
The name of the table which holds the dataset.
34+
hhs_dataset_id : str
35+
The 9-character healthdata.gov identifier for this dataset.
3336
columns_and_types : tuple[str, str, Callable]
3437
List of 3-tuples of (CSV header name, SQL column name, data type) for
3538
all the columns in the CSV file.
@@ -40,6 +43,7 @@ def __init__(self,
4043

4144
self.connection = connection
4245
self.table_name = table_name
46+
self.hhs_dataset_id = hhs_dataset_id
4347
self.publication_col_name = "issue" if table_name == 'covid_hosp_state_timeseries' else \
4448
'publication_date'
4549
self.columns_and_types = {
@@ -115,8 +119,8 @@ def contains_revision(self, revision):
115119
FROM
116120
`covid_hosp_meta`
117121
WHERE
118-
`dataset_name` = %s AND `revision_timestamp` = %s
119-
''', (self.table_name, revision))
122+
`hhs_dataset_id` = %s AND `revision_timestamp` = %s
123+
''', (self.hhs_dataset_id, revision))
120124
for (result,) in cursor:
121125
return bool(result)
122126

@@ -138,14 +142,15 @@ def insert_metadata(self, publication_date, revision, meta_json):
138142
INSERT INTO
139143
`covid_hosp_meta` (
140144
`dataset_name`,
145+
`hhs_dataset_id`,
141146
`publication_date`,
142147
`revision_timestamp`,
143148
`metadata_json`,
144149
`acquisition_datetime`
145150
)
146151
VALUES
147-
(%s, %s, %s, %s, NOW())
148-
''', (self.table_name, publication_date, revision, meta_json))
152+
(%s, %s, %s, %s, %s, NOW())
153+
''', (self.table_name, self.hhs_dataset_id, publication_date, revision, meta_json))
149154

150155
def insert_dataset(self, publication_date, dataframe):
151156
"""Add a dataset to the database.
@@ -232,7 +237,7 @@ def get_max_issue(self):
232237
from
233238
`covid_hosp_meta`
234239
WHERE
235-
dataset_name = "{self.table_name}"
240+
hhs_dataset_id = "{self.hhs_dataset_id}"
236241
''')
237242
for (result,) in cursor:
238243
if result is not None:

src/acquisition/covid_hosp/common/utils.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -169,19 +169,20 @@ def update_dataset(database, network, newer_than=None, older_than=None):
169169
# download the dataset and add it to the database
170170
dataset = Utils.merge_by_key_cols([network.fetch_dataset(url) for url, _ in revisions],
171171
db.KEY_COLS)
172-
# add metadata to the database using the last revision seen.
173-
last_url, last_index = revisions[-1]
174-
metadata_json = metadata.loc[last_index].reset_index().to_json()
172+
# collect the metadata for every revision (not only the last one); it is written to the database further below
173+
all_metadata = []
174+
for url, index in revisions:
175+
all_metadata.append((url, metadata.loc[index].reset_index().to_json()))
175176
datasets.append((
176177
issue_int,
177178
dataset,
178-
last_url,
179-
metadata_json
179+
all_metadata
180180
))
181181
with database.connect() as db:
182-
for issue_int, dataset, last_url, metadata_json in datasets:
182+
for issue_int, dataset, all_metadata in datasets:
183183
db.insert_dataset(issue_int, dataset)
184-
db.insert_metadata(issue_int, last_url, metadata_json)
184+
for url, metadata_json in all_metadata:
185+
db.insert_metadata(issue_int, url, metadata_json)
185186
print(f'successfully acquired {len(dataset)} rows')
186187

187188
# note that the transaction is committed by exiting the `with` block

src/acquisition/covid_hosp/facility/database.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from delphi.epidata.acquisition.covid_hosp.common.database import Database as BaseDatabase
33
from delphi.epidata.acquisition.covid_hosp.common.database import Columndef
44
from delphi.epidata.acquisition.covid_hosp.common.utils import Utils
5+
from delphi.epidata.acquisition.covid_hosp.facility.network import Network
56

67

78
class Database(BaseDatabase):
@@ -213,5 +214,6 @@ def __init__(self, *args, **kwargs):
213214
*args,
214215
**kwargs,
215216
table_name=Database.TABLE_NAME,
217+
hhs_dataset_id=Network.DATASET_ID,
216218
key_columns=Database.KEY_COLS,
217219
columns_and_types=Database.ORDERED_CSV_COLUMNS)

src/acquisition/covid_hosp/state_daily/database.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from delphi.epidata.acquisition.covid_hosp.common.database import Database as BaseDatabase
33
from delphi.epidata.acquisition.covid_hosp.common.database import Columndef
44
from delphi.epidata.acquisition.covid_hosp.common.utils import Utils
5+
from delphi.epidata.acquisition.covid_hosp.state_daily.network import Network
56

67

78
class Database(BaseDatabase):
@@ -223,6 +224,7 @@ def __init__(self, *args, **kwargs):
223224
*args,
224225
**kwargs,
225226
table_name=Database.TABLE_NAME,
227+
hhs_dataset_id=Network.DATASET_ID,
226228
columns_and_types=Database.ORDERED_CSV_COLUMNS,
227229
key_columns=Database.KEY_COLS,
228230
additional_fields=[Columndef('D', 'record_type', None)])

src/acquisition/covid_hosp/state_timeseries/database.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from delphi.epidata.acquisition.covid_hosp.common.database import Database as BaseDatabase
33
from delphi.epidata.acquisition.covid_hosp.common.database import Columndef
44
from delphi.epidata.acquisition.covid_hosp.common.utils import Utils
5+
from delphi.epidata.acquisition.covid_hosp.state_timeseries.network import Network
56

67

78
class Database(BaseDatabase):
@@ -222,6 +223,7 @@ def __init__(self, *args, **kwargs):
222223
*args,
223224
**kwargs,
224225
table_name=Database.TABLE_NAME,
226+
hhs_dataset_id=Network.DATASET_ID,
225227
columns_and_types=Database.ORDERED_CSV_COLUMNS,
226228
key_columns=Database.KEY_COLS,
227229
additional_fields=[Columndef('T', 'record_type', None)])

src/ddl/covid_hosp.sql

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ surfaced through the Epidata API.
4848
CREATE TABLE `covid_hosp_meta` (
4949
`id` INT NOT NULL AUTO_INCREMENT,
5050
`dataset_name` VARCHAR(64) NOT NULL,
51+
`hhs_dataset_id` CHAR(9) NOT NULL DEFAULT "????-????",
5152
`publication_date` INT NOT NULL,
5253
`revision_timestamp` VARCHAR(512) NOT NULL,
5354
`metadata_json` JSON NOT NULL,
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
ALTER TABLE covid_hosp_meta ADD COLUMN hhs_dataset_id CHAR(9) NOT NULL DEFAULT "????-????";
2+
UPDATE covid_hosp_meta SET hhs_dataset_id="g62h-syeh" WHERE revision_timestamp LIKE "%g62h-syeh%";
3+
UPDATE covid_hosp_meta SET hhs_dataset_id="6xf2-c3ie" WHERE revision_timestamp LIKE "%6xf2-c3ie%";
4+
UPDATE covid_hosp_meta SET hhs_dataset_id="anag-cw7u" WHERE revision_timestamp LIKE "%anag-cw7u%";

tests/acquisition/covid_hosp/common/test_database.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ def test_contains_revision(self):
6868

6969
mock_connection = MagicMock()
7070
mock_cursor = mock_connection.cursor()
71-
database = Database(mock_connection, table_name=sentinel.table_name)
71+
database = Database(mock_connection, table_name=sentinel.table_name, hhs_dataset_id=sentinel.hhs_dataset_id)
7272

7373
with self.subTest(name='new revision'):
7474
mock_cursor.__iter__.return_value = [(0,)]
@@ -78,7 +78,7 @@ def test_contains_revision(self):
7878
# compare with boolean literal to test the type cast
7979
self.assertIs(result, False)
8080
query_values = mock_cursor.execute.call_args[0][-1]
81-
self.assertEqual(query_values, (sentinel.table_name, sentinel.revision))
81+
self.assertEqual(query_values, (sentinel.hhs_dataset_id, sentinel.revision))
8282

8383
with self.subTest(name='old revision'):
8484
mock_cursor.__iter__.return_value = [(1,)]
@@ -88,7 +88,7 @@ def test_contains_revision(self):
8888
# compare with boolean literal to test the type cast
8989
self.assertIs(result, True)
9090
query_values = mock_cursor.execute.call_args[0][-1]
91-
self.assertEqual(query_values, (sentinel.table_name, sentinel.revision))
91+
self.assertEqual(query_values, (sentinel.hhs_dataset_id, sentinel.revision))
9292

9393
def test_insert_metadata(self):
9494
"""Add new metadata to the database."""
@@ -98,7 +98,7 @@ def test_insert_metadata(self):
9898

9999
mock_connection = MagicMock()
100100
mock_cursor = mock_connection.cursor()
101-
database = Database(mock_connection, table_name=sentinel.dataset_name)
101+
database = Database(mock_connection, table_name=sentinel.table_name, hhs_dataset_id=sentinel.hhs_dataset_id)
102102

103103
result = database.insert_metadata(
104104
sentinel.publication_date,
@@ -108,7 +108,8 @@ def test_insert_metadata(self):
108108
self.assertIsNone(result)
109109
actual_values = mock_cursor.execute.call_args[0][-1]
110110
expected_values = (
111-
sentinel.dataset_name,
111+
sentinel.table_name,
112+
sentinel.hhs_dataset_id,
112113
sentinel.publication_date,
113114
sentinel.revision,
114115
sentinel.meta_json,

tests/acquisition/covid_hosp/common/test_utils.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,10 @@ def test_run_acquire_new_dataset(self):
129129

130130
self.assertTrue(result)
131131

132-
mock_connection.insert_metadata.assert_called_once()
132+
# should have been called twice
133+
mock_connection.insert_metadata.assert_called()
134+
assert mock_connection.insert_metadata.call_count == 2
135+
# most recent call should be for the final revision at url2
133136
args = mock_connection.insert_metadata.call_args[0]
134137
self.assertEqual(args[:2], (20210315, "url2"))
135138
pd.testing.assert_frame_equal(

0 commit comments

Comments
 (0)