Skip to content

Commit 02de2eb

Browse files
committed
add extra edge cases, and duplicate checking
1 parent 908df83 commit 02de2eb

File tree

4 files changed

+179
-31
lines changed

4 files changed

+179
-31
lines changed

src/acquisition/rvdss/pull_historic.py

Lines changed: 18 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -342,13 +342,20 @@ def fix_edge_cases(table,season,caption,current_week):
342342
# In week 47 of the 2017-2018 season, a date is written as 201-11-25,
343343
# instead of 2017-11-25
344344
table.loc[table['week'] == 47, 'week end'] = "2017-11-25"
345+
elif current_week == 26 and "number" in caption.text.lower():
346+
# Anomalous row with decimal counts that differs a lot from the next week; drop week 26.
347+
table = table[table.week != 26]
345348
elif season[0] == '2015' and current_week == 41:
346349
# In week 41 of the 2015-2016 season, a date written in m-d-y format not d-m-y
347350
table=table.replace("10-17-2015","17-10-2015",regex=True)
348351
elif season[0] == '2022' and current_week == 11 and "hmpv" in caption.text.lower():
349352
# In week 11 of the 2022-2023 season, in the positive hmpv table,
350353
# a date is written as 022-09-03, instead of 2022-09-03
351354
table.loc[table['week'] == 35, 'week end'] = "2022-09-03"
355+
elif season[0] == '2016' and current_week == 32 and "influenza" in caption.text.lower():
356+
# In week 32 of the 2016-2017 season, in the influenza table,
357+
# the week end date for week 32 is wrong; set it to 2017-08-12
358+
table.loc[table['week'] == 32, 'week end'] = "2017-08-12"
352359
return(table)
353360

354361
def fetch_one_season_from_report(url):
@@ -389,6 +396,8 @@ def fetch_one_season_from_report(url):
389396

390397
positive_tables=[]
391398
number_table_exists = False
399+
respiratory_detection_table_exists = False
400+
positive_table_exists = False
392401
for i in range(len(captions)):
393402
caption=captions[i]
394403
tab = caption.find_next('table')
@@ -456,7 +465,8 @@ def fetch_one_season_from_report(url):
456465

457466
table = fix_edge_cases(table, season[0], caption, current_week)
458467

459-
# check if both ah1 and h1n1 are given. If so drop one since they are the same virus and ah1 is always empty
468+
# check if both ah1 and h1n1 are given. If so drop one since they are the same virus and ah1 is
469+
# always empty
460470
table = drop_ah1_columns(table)
461471

462472
# Rename columns
@@ -523,20 +533,18 @@ def fetch_one_season_from_report(url):
523533
# If not, add the weeks tables into the season table
524534

525535
# Deduplicate: only append a week's table when none of its index entries are already in the season table
526-
all_respiratory_detection_tables= pd.concat([all_respiratory_detection_tables,respiratory_detection_table])
527-
all_positive_tables=pd.concat([all_positive_tables,combined_positive_tables])
528536
if not number_detections_table.index.isin(all_number_tables.index).any():
529537
all_number_tables=pd.concat([all_number_tables,number_detections_table])
530538

531-
# if not respiratory_detection_table.index.isin(all_respiratory_detection_tables.index).any():
532-
# all_respiratory_detection_tables= pd.concat([all_respiratory_detection_tables,respiratory_detection_table])
539+
if not respiratory_detection_table.index.isin(all_respiratory_detection_tables.index).any():
540+
all_respiratory_detection_tables= pd.concat([all_respiratory_detection_tables,respiratory_detection_table])
533541

534-
# if not combined_positive_tables.index.isin(all_positive_tables.index).any():
535-
# all_positive_tables=pd.concat([all_positive_tables,combined_positive_tables])
542+
if not combined_positive_tables.index.isin(all_positive_tables.index).any():
543+
all_positive_tables=pd.concat([all_positive_tables,combined_positive_tables])
536544

537-
# if number_table_exists:
538-
# if not number_detections_table.index.isin(all_number_tables.index).any():
539-
# all_number_tables=pd.concat([all_number_tables,number_detections_table])
545+
if number_table_exists:
546+
if not number_detections_table.index.isin(all_number_tables.index).any():
547+
all_number_tables=pd.concat([all_number_tables,number_detections_table])
540548

541549
return {
542550
"respiratory_detection": all_respiratory_detection_tables,

src/acquisition/rvdss/test.py

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
def update(table_key='', first=None, last=None, force_update=False, load_email=True):
    """Build and print the upsert statement for one RVDSS table.

    The table name and column list are looked up by ``table_key`` in the
    module-level ``expected_table_names`` and ``expected_columns`` mappings.
    The statement uses named ``%(column)s`` placeholders only, so a row can be
    bound from a single dict keyed by column name.

    Args:
        table_key: key into ``expected_table_names`` / ``expected_columns``.
        first: accepted for interface compatibility; not used here.
        last: accepted for interface compatibility; not used here.
        force_update: accepted for interface compatibility; not used here.
        load_email: accepted for interface compatibility; not used here.

    Returns:
        The rendered SQL text (it is also printed).

    Raises:
        KeyError: if ``table_key`` is not present in both mappings.
    """
    if table_key not in expected_table_names or table_key not in expected_columns:
        raise KeyError(f"unknown RVDSS table key: {table_key!r}")

    table_name = expected_table_names[table_key]
    columns = expected_columns[table_key]

    field_names = ", ".join(f"`{name}`" for name in columns)
    field_values = ", ".join(f"%({name})s" for name in columns)
    # On a key collision, overwrite each column with the incoming row's value.
    # The previous template updated a `value` column that none of the column
    # lists contain, and did so through a positional %s that cannot be mixed
    # with the named placeholders above.
    # NOTE(review): VALUES() in this clause is MySQL syntax (deprecated in
    # favour of row aliases from 8.0.20) -- confirm against the server version.
    field_updates = ", ".join(f"`{name}` = VALUES(`{name}`)" for name in columns)

    # check rvdss for new and/or revised data
    sql = f"""
    INSERT INTO {table_name} ({field_names})
    VALUES ({field_values})
    ON DUPLICATE KEY UPDATE
    {field_updates}
    """
    print(sql)
    return sql
17+
18+
# Column names for the respiratory detections table; registered under the
# "respiratory_detection" key of expected_columns.
respiratory_detections_cols= (
19+
"epiweek",
20+
"time_value",
21+
"issue",
22+
"geo_type",
23+
"geo_value",
24+
"sarscov2_tests",
25+
"sarscov2_positive_tests",
26+
"flu_tests",
27+
"flu_positive_tests",
28+
"fluah1n1pdm09_positive_tests",
29+
"fluah3_positive_tests",
30+
"fluauns_positive_tests",
31+
"flua_positive_tests",
32+
"flub_positive_tests",
33+
"rsv_tests",
34+
"rsv_positive_tests",
35+
"hpiv_tests",
36+
"hpiv1_positive_tests",
37+
"hpiv2_positive_tests",
38+
"hpiv3_positive_tests",
39+
"hpiv4_positive_tests",
40+
"hpivother_positive_tests",
41+
"adv_tests",
42+
"adv_positive_tests",
43+
"hmpv_tests",
44+
"hmpv_positive_tests",
45+
"evrv_tests",
46+
"evrv_positive_tests",
47+
"hcov_tests",
48+
"hcov_positive_tests",
49+
"week",
50+
"weekorder",
51+
"year"
52+
)
53+
54+
# Column names for the percent-positive table; registered under the
# "positive" key of expected_columns.
# NOTE(review): every other virus here lists a <virus>_positive_tests column,
# but there is no "adv_positive_tests" -- confirm against the table schema.
pct_positive_cols = (
55+
"epiweek",
56+
"time_value",
57+
"issue",
58+
"geo_type",
59+
"geo_value",
60+
"evrv_pct_positive",
61+
"evrv_tests",
62+
"evrv_positive_tests",
63+
"hpiv_pct_positive",
64+
"hpiv_tests",
65+
"hpiv_positive_tests",
66+
"adv_pct_positive",
67+
"adv_tests",
68+
"hcov_pct_positive",
69+
"hcov_tests",
70+
"hcov_positive_tests",
71+
"flua_pct_positive",
72+
"flub_pct_positive",
73+
"flu_tests",
74+
"flua_positive_tests",
75+
"flua_tests",
76+
"flub_tests",
77+
"flub_positive_tests",
78+
"flu_positive_tests",
79+
"flu_pct_positive",
80+
"hmpv_pct_positive",
81+
"hmpv_tests",
82+
"hmpv_positive_tests",
83+
"rsv_pct_positive",
84+
"rsv_tests",
85+
"rsv_positive_tests",
86+
"sarscov2_pct_positive",
87+
"sarscov2_tests",
88+
"sarscov2_positive_tests",
89+
"region",
90+
"week",
91+
"weekorder",
92+
"year"
93+
)
94+
95+
# Column names for the detection counts table; registered under the
# "count" key of expected_columns.
detections_counts_cols = (
96+
"epiweek",
97+
"time_value",
98+
"issue" ,
99+
"geo_type",
100+
"geo_value",
101+
"hpiv_positive_tests",
102+
"adv_positive_tests",
103+
"hmpv_positive_tests",
104+
"evrv_positive_tests",
105+
"hcov_positive_tests",
106+
"rsv_positive_tests",
107+
"flu_positive_tests"
108+
)
109+
110+
# Maps each table key accepted by update() to the database table name that is
# interpolated into the generated INSERT statement.
# NOTE(review): "rvdss_repiratory_detections" is spelled without the first "s"
# of "respiratory" -- confirm it matches the actual table name before changing.
expected_table_names = {
111+
"respiratory_detection":"rvdss_repiratory_detections",
112+
"positive":"rvdss_pct_positive" ,
113+
"count": "rvdss_detections_counts"
114+
}
115+
116+
# Maps each table key accepted by update() to the tuple of column names used to
# build the field list and the named placeholders of the INSERT statement.
expected_columns = {
117+
"respiratory_detection":respiratory_detections_cols,
118+
"positive": pct_positive_cols,
119+
"count":detections_counts_cols
120+
}
121+
122+
# Print the generated statement once for each known table key.
update("count")
123+
update("positive")
124+
update("respiratory_detection")
125+
126+
import pandas as pd
127+
128+
# Read the CSV from the current working directory and convert its rows to
# plain tuples (no index, no namedtuple wrapper).
data = pd.read_csv("positive_tests.csv")
129+
b = list(data.itertuples(index=False,name=None))

src/acquisition/rvdss/utils.py

Lines changed: 20 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ def preprocess_table_columns(table):
105105
table.columns = [re.sub("canada","can",t) for t in table.columns]
106106
table.columns = [re.sub(r"\bcb\b","bc",t) for t in table.columns]
107107

108-
table.columns =[re.sub(r"h1n1 2009 |h1n12009|a_h1|ah1\b", "ah1n1pdm09", s)for s in table.columns]
108+
table.columns =[re.sub(r"h1n1 2009 |h1n12009|a_h1|ah1\b|ah1pdm09", "ah1n1pdm09", s)for s in table.columns]
109109
table.columns =[re.sub(r"a_uns", "auns", s)for s in table.columns]
110110
table.columns =[re.sub(r"a_h3", "ah3", s)for s in table.columns]
111111

@@ -231,33 +231,37 @@ def get_positive_data(base_url,headers,update_date):
231231

232232

233233
def get_detections_data(base_url,headers,update_date):
234-
# Get current week and year
235-
# summary_url = base_url + "RVD_SummaryText.csv"
236-
# summary_url_response = requests.get(summary_url, headers=headers)
237-
# summary_df = pd.read_csv(io.StringIO(summary_url_response.text))
238-
# week_df = summary_df[(summary_df['Section'] == "summary") & (summary_df['Type']=="title")]
239-
# week_string = week_df.iloc[0]['Text'].lower()
240-
# current_week = int(re.search("week (.+?) ", week_string).group(1))
241-
# current_year= int(re.search(r"20\d{2}", week_string).group(0))
242-
# current_epiweek= Week(current_year,current_week)
243-
244234
# Get weekly data
245235
detections_url = base_url + "RVD_CurrentWeekTable.csv"
246236
detections_url_response = requests.get(detections_url, headers=headers)
247237
detections_url_response.encoding='UTF-8'
248238
df_detections = pd.read_csv(io.StringIO(detections_url_response.text))
249239

250-
df_detections["year"] = [int(re.search(r"20\d{2}", w).group(0)) for w in df_detections["date"]]
251-
ew = df_detections.apply(lambda x: Week(x['year'],x['week']),axis=1)
240+
if ("date" in df_detections.columns):
241+
df_detections["year"] = [int(re.search(r"20\d{2}", w).group(0)) for w in df_detections["date"]]
242+
ew = df_detections.apply(lambda x: Week(x['year'],x['week']),axis=1)
243+
df_detections.insert(0,"epiweek",[int(str(w)) for w in ew])
244+
df_detections['epiweek'] = [int(str(w)) for w in df_detections['epiweek']]
245+
else:
246+
#Get current week and year
247+
summary_url = base_url + "RVD_SummaryText.csv"
248+
summary_url_response = requests.get(summary_url, headers=headers)
249+
summary_df = pd.read_csv(io.StringIO(summary_url_response.text))
250+
week_df = summary_df[(summary_df['Section'] == "summary") & (summary_df['Type']=="title")]
251+
week_string = week_df.iloc[0]['Text'].lower()
252+
current_week = int(re.search("week (.+?) ", week_string).group(1))
253+
current_year= int(re.search(r"20\d{2}", week_string).group(0))
254+
current_epiweek= Week(current_year,current_week)
255+
df_detections['epiweek'] = int(str(current_epiweek))
256+
df_detections['date'] = current_epiweek.enddate()
257+
252258

253259
# swap order of names from a_b to b_a
254260
df_detections = df_detections.rename(columns=lambda x: '_'.join(x.split('_')[1:]+x.split('_')[:1]))
255-
df_detections.insert(0,"epiweek",[int(str(w)) for w in ew])
256-
df_detections['epiweek'] = [int(str(w)) for w in df_detections['epiweek']]
257261
df_detections.insert(2,"issue",update_date)
258-
259262
df_detections=preprocess_table_columns(df_detections)
260263
df_detections.columns=[re.sub(r' ','_',c) for c in df_detections.columns]
264+
261265
df_detections=df_detections.rename(columns={'reportinglaboratory':"geo_value",'date':"time_value"})
262266
df_detections['geo_value'] = [abbreviate_geo(g) for g in df_detections['geo_value']]
263267
df_detections['geo_type'] = [create_geo_types(g,"lab") for g in df_detections['geo_value']]

tests/acquisition/rvdss/test_pull_historic.py

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,9 @@
4545
'hmpv%.1':1, 'qc tests':1, 'hmpv%.2':1, 'on tests':1,
4646
'hmpv%.3':1, 'pr tests':1, 'hmpv%.4':1, 'bc tests':1,
4747
'hmpv%.5':1}]),
48-
pd.DataFrame(columns=["week end","pos_tests","percent_pos"])]
48+
pd.DataFrame(columns=["week end","pos_tests","percent_pos"]),
49+
pd.DataFrame([{"week":32,"week end":"2017-08-17"}]),
50+
pd.DataFrame({"week":[25,26],"week end":["2017-08-12","2017-08-19"]})]
4951

5052
expected_edge_case_tables=[
5153
pd.DataFrame(columns=['week', 'week end', 'canada tests', 'entero/rhino%', 'at tests',
@@ -65,20 +67,25 @@
6567
'hmpv%.1':1, 'qc tests':1, 'hmpv%.2':1, 'on tests':1,
6668
'hmpv%.3':1, 'pr tests':1, 'hmpv%.4':1, 'bc tests':1,
6769
'hmpv%.5':1}]),
68-
pd.DataFrame(columns=["week end","pos_tests","percent_pos"])]
70+
pd.DataFrame(columns=["week end","pos_tests","percent_pos"]),
71+
pd.DataFrame([{"week":32,"week end":"2017-08-12"}]),
72+
pd.DataFrame([{"week":25,"week end":"2017-08-12"}])]
6973

7074
example_edge_case_captions=[
7175
[t for t in captions if "Entero" in t.text][0],
7276
[t for t in captions if "Adeno" in t.text][0],
7377
[t for t in captions if "RSV" in t.text][0],
7478
[t for t in captions if "RSV" in t.text][0],
7579
[t for t in captions if "hMPV" in t.text][0],
76-
[t for t in captions if "hMPV" in t.text][0]]
80+
[t for t in captions if "hMPV" in t.text][0],
81+
[t for t in captions if "Influenza" in t.text][0],
82+
[t for t in captions if "Number" in t.text][0]]
7783

7884
example_edge_case_seasons=[["2017","2018"],["2017","2018"],["2017","2018"],
79-
["2015","2016"],["2022","2023"],["2021","2022"]]
85+
["2015","2016"],["2022","2023"],["2021","2022"],
86+
["2016","2017"],["2017","2018"]]
8087

81-
example_edge_case_weeks=[35,35,47,41,11,10]
88+
example_edge_case_weeks=[35,35,47,41,11,10,32,26]
8289

8390
class TestPullHistoric():
8491

0 commit comments

Comments
 (0)