Add extra edge cases and duplicate checking

cchuong · cchuong · commit 02de2eb1992d · 2025-05-15T16:35:30.000-07:00
diff --git a/src/acquisition/rvdss/pull_historic.py b/src/acquisition/rvdss/pull_historic.py
@@ -342,13 +342,20 @@ def fix_edge_cases(table,season,caption,current_week):
             #  In week 47 of the 2017-2018 season, a date is written as 201-11-25,
             #  instead of 2017-11-25
             table.loc[table['week'] == 47, 'week end'] = "2017-11-25"
+        elif current_week == 26 and "number" in caption.text.lower():
+            #  Anomalous week 26 row with decimal counts that differ a lot from the next week; drop it.
+            table = table[table.week != 26]
     elif season[0] == '2015' and current_week == 41:
         # In week 41 of the 2015-2016 season, a date written in m-d-y format not d-m-y
         table=table.replace("10-17-2015","17-10-2015",regex=True)
     elif season[0] == '2022' and current_week == 11 and "hmpv" in caption.text.lower():
         #  In week 11 of the 2022-2023 season, in the positive hmpv table,
         # a date is written as 022-09-03, instead of 2022-09-03
          table.loc[table['week'] == 35, 'week end'] = "2022-09-03"
+    elif season[0] == '2016' and current_week == 32 and "influenza" in caption.text.lower():
+        #  In week 32 of the 2016-2017 season, in the influenza table,
+        # the week end date for week 32 is written incorrectly; it should be 2017-08-12
+         table.loc[table['week'] == 32, 'week end'] = "2017-08-12"
     return(table)
 
 def fetch_one_season_from_report(url):
@@ -389,6 +396,8 @@ def fetch_one_season_from_report(url):
 
         positive_tables=[]
         number_table_exists = False
+        respiratory_detection_table_exists = False
+        positive_table_exists = False
         for i in range(len(captions)):
             caption=captions[i]
             tab = caption.find_next('table')
@@ -456,7 +465,8 @@ def fetch_one_season_from_report(url):
 
             table = fix_edge_cases(table, season[0], caption, current_week)    
 
-# check if both ah1 and h1n1 are given. If so drop one since they are the same virus and ah1 is always empty
+            # check if both ah1 and h1n1 are given. If so drop one since they are the same virus and ah1 is 
+            # always empty
             table = drop_ah1_columns(table)
 
             # Rename columns
@@ -523,20 +533,18 @@ def fetch_one_season_from_report(url):
         # If not, add the weeks tables into the season table
 
         # check for deduplication pandas
-        all_respiratory_detection_tables= pd.concat([all_respiratory_detection_tables,respiratory_detection_table])
-        all_positive_tables=pd.concat([all_positive_tables,combined_positive_tables])
         if not number_detections_table.index.isin(all_number_tables.index).any():
             all_number_tables=pd.concat([all_number_tables,number_detections_table])
         
-        # if not respiratory_detection_table.index.isin(all_respiratory_detection_tables.index).any():
-        #     all_respiratory_detection_tables= pd.concat([all_respiratory_detection_tables,respiratory_detection_table])
+        if not respiratory_detection_table.index.isin(all_respiratory_detection_tables.index).any():
+              all_respiratory_detection_tables= pd.concat([all_respiratory_detection_tables,respiratory_detection_table])
 
-        # if not combined_positive_tables.index.isin(all_positive_tables.index).any():
-        #     all_positive_tables=pd.concat([all_positive_tables,combined_positive_tables])
+        if not combined_positive_tables.index.isin(all_positive_tables.index).any():
+            all_positive_tables=pd.concat([all_positive_tables,combined_positive_tables])
 
-        # if number_table_exists:
-        #     if not number_detections_table.index.isin(all_number_tables.index).any():
-        #         all_number_tables=pd.concat([all_number_tables,number_detections_table])
+        if number_table_exists:
+            if not number_detections_table.index.isin(all_number_tables.index).any():
+                all_number_tables=pd.concat([all_number_tables,number_detections_table])
 
     return {
         "respiratory_detection": all_respiratory_detection_tables,
diff --git a/src/acquisition/rvdss/test.py b/src/acquisition/rvdss/test.py
@@ -0,0 +1,129 @@
+def update(table_key= '', first=None, last=None, force_update=False, load_email=True):
+
+    table_name = expected_table_names[table_key]
+    field_names = ", ".join(
+            f"`{name}`" for name in expected_columns[table_key])
+    
+    field_values = ", ".join(
+           f"%({name})s" for name in expected_columns[table_key])
+    # build the INSERT ... ON DUPLICATE KEY UPDATE statement for this table
+    sql = f"""
+    INSERT INTO {table_name} ({field_names})
+    VALUES ({field_values})
+    ON DUPLICATE KEY UPDATE
+      `value` = %s
+    """
+    print(sql)
+
+respiratory_detections_cols= (
+    "epiweek",
+    "time_value",
+    "issue",
+    "geo_type",
+    "geo_value",
+    "sarscov2_tests",
+    "sarscov2_positive_tests",
+    "flu_tests",
+    "flu_positive_tests",
+    "fluah1n1pdm09_positive_tests",
+    "fluah3_positive_tests",
+    "fluauns_positive_tests",
+    "flua_positive_tests",
+    "flub_positive_tests",
+    "rsv_tests",
+    "rsv_positive_tests",
+    "hpiv_tests",
+    "hpiv1_positive_tests",
+    "hpiv2_positive_tests",
+    "hpiv3_positive_tests",
+    "hpiv4_positive_tests",
+    "hpivother_positive_tests",
+    "adv_tests",
+    "adv_positive_tests",
+    "hmpv_tests",
+    "hmpv_positive_tests",
+    "evrv_tests",
+    "evrv_positive_tests",
+    "hcov_tests",
+    "hcov_positive_tests",
+    "week",
+    "weekorder",
+    "year"
+)
+
+pct_positive_cols = ( 
+    "epiweek",
+    "time_value",
+    "issue",
+    "geo_type",
+    "geo_value",
+    "evrv_pct_positive",
+    "evrv_tests",
+    "evrv_positive_tests",
+    "hpiv_pct_positive",
+    "hpiv_tests",
+    "hpiv_positive_tests",
+    "adv_pct_positive",
+    "adv_tests",
+    "hcov_pct_positive",
+    "hcov_tests",
+    "hcov_positive_tests",
+    "flua_pct_positive",
+    "flub_pct_positive",
+    "flu_tests",
+    "flua_positive_tests",
+    "flua_tests",
+    "flub_tests",
+    "flub_positive_tests",
+    "flu_positive_tests",
+    "flu_pct_positive",
+    "hmpv_pct_positive",
+    "hmpv_tests",
+    "hmpv_positive_tests",
+    "rsv_pct_positive",
+    "rsv_tests",
+    "rsv_positive_tests",
+    "sarscov2_pct_positive",
+    "sarscov2_tests",
+    "sarscov2_positive_tests",
+    "region",
+    "week",
+    "weekorder",
+    "year"
+)
+
+detections_counts_cols = (
+    "epiweek",
+    "time_value",
+    "issue" ,
+    "geo_type",
+    "geo_value",
+    "hpiv_positive_tests",
+    "adv_positive_tests",
+    "hmpv_positive_tests",
+    "evrv_positive_tests",
+    "hcov_positive_tests",
+    "rsv_positive_tests",
+    "flu_positive_tests"
+)
+
+expected_table_names = {
+    "respiratory_detection":"rvdss_repiratory_detections",
+    "positive":"rvdss_pct_positive" ,
+    "count": "rvdss_detections_counts"
+}
+
+expected_columns = {
+    "respiratory_detection":respiratory_detections_cols,
+    "positive": pct_positive_cols,
+    "count":detections_counts_cols
+}
+
+update("count")
+update("positive")
+update("respiratory_detection")
+
+import pandas as pd
+
+data = pd.read_csv("positive_tests.csv")
+b = list(data.itertuples(index=False,name=None))
diff --git a/src/acquisition/rvdss/utils.py b/src/acquisition/rvdss/utils.py
@@ -105,7 +105,7 @@ def preprocess_table_columns(table):
     table.columns = [re.sub("canada","can",t) for t in table.columns]
     table.columns = [re.sub(r"\bcb\b","bc",t) for t in table.columns]
 
-    table.columns =[re.sub(r"h1n1 2009 |h1n12009|a_h1|ah1\b", "ah1n1pdm09", s)for s in table.columns]
+    table.columns =[re.sub(r"h1n1 2009 |h1n12009|a_h1|ah1\b|ah1pdm09", "ah1n1pdm09", s)for s in table.columns]
     table.columns =[re.sub(r"a_uns", "auns", s)for s in table.columns]
     table.columns =[re.sub(r"a_h3", "ah3", s)for s in table.columns]
 
@@ -231,33 +231,37 @@ def get_positive_data(base_url,headers,update_date):
 
 
 def get_detections_data(base_url,headers,update_date):
-    # Get current week and year
-    # summary_url =  base_url + "RVD_SummaryText.csv"
-    # summary_url_response = requests.get(summary_url, headers=headers)
-    # summary_df = pd.read_csv(io.StringIO(summary_url_response.text))
-    # week_df = summary_df[(summary_df['Section'] == "summary") & (summary_df['Type']=="title")]
-    # week_string = week_df.iloc[0]['Text'].lower()
-    # current_week = int(re.search("week (.+?) ", week_string).group(1))
-    # current_year= int(re.search(r"20\d{2}", week_string).group(0))
-    # current_epiweek= Week(current_year,current_week)
-
     # Get weekly data
     detections_url = base_url + "RVD_CurrentWeekTable.csv"
     detections_url_response = requests.get(detections_url, headers=headers)
     detections_url_response.encoding='UTF-8'
     df_detections = pd.read_csv(io.StringIO(detections_url_response.text))
     
-    df_detections["year"] = [int(re.search(r"20\d{2}", w).group(0)) for w in  df_detections["date"]] 
-    ew = df_detections.apply(lambda x: Week(x['year'],x['week']),axis=1)
+    if ("date" in df_detections.columns):
+        df_detections["year"] = [int(re.search(r"20\d{2}", w).group(0)) for w in  df_detections["date"]] 
+        ew = df_detections.apply(lambda x: Week(x['year'],x['week']),axis=1)
+        df_detections.insert(0,"epiweek",[int(str(w)) for w in ew])
+        df_detections['epiweek'] = [int(str(w)) for w in df_detections['epiweek']]
+    else:
+        #Get current week and year
+        summary_url =  base_url + "RVD_SummaryText.csv"
+        summary_url_response = requests.get(summary_url, headers=headers)
+        summary_df = pd.read_csv(io.StringIO(summary_url_response.text))
+        week_df = summary_df[(summary_df['Section'] == "summary") & (summary_df['Type']=="title")]
+        week_string = week_df.iloc[0]['Text'].lower()
+        current_week = int(re.search("week (.+?) ", week_string).group(1))
+        current_year= int(re.search(r"20\d{2}", week_string).group(0))
+        current_epiweek= Week(current_year,current_week)
+        df_detections['epiweek'] = int(str(current_epiweek))
+        df_detections['date'] = current_epiweek.enddate()
+
 
     # swap order of names from a_b to b_a
     df_detections = df_detections.rename(columns=lambda x: '_'.join(x.split('_')[1:]+x.split('_')[:1]))
-    df_detections.insert(0,"epiweek",[int(str(w)) for w in ew])
-    df_detections['epiweek'] = [int(str(w)) for w in df_detections['epiweek']]
     df_detections.insert(2,"issue",update_date)
-    
     df_detections=preprocess_table_columns(df_detections)
     df_detections.columns=[re.sub(r' ','_',c) for c in df_detections.columns]
+    
     df_detections=df_detections.rename(columns={'reportinglaboratory':"geo_value",'date':"time_value"})
     df_detections['geo_value'] = [abbreviate_geo(g) for g in df_detections['geo_value']]
     df_detections['geo_type'] = [create_geo_types(g,"lab") for g in df_detections['geo_value']]
diff --git a/tests/acquisition/rvdss/test_pull_historic.py b/tests/acquisition/rvdss/test_pull_historic.py
@@ -45,7 +45,9 @@
        'hmpv%.1':1, 'qc tests':1, 'hmpv%.2':1, 'on tests':1,
        'hmpv%.3':1, 'pr tests':1, 'hmpv%.4':1, 'bc tests':1,
        'hmpv%.5':1}]),
-    pd.DataFrame(columns=["week end","pos_tests","percent_pos"])]
+    pd.DataFrame(columns=["week end","pos_tests","percent_pos"]),
+    pd.DataFrame([{"week":32,"week end":"2017-08-17"}]),
+    pd.DataFrame({"week":[25,26],"week end":["2017-08-12","2017-08-19"]})]
 
 expected_edge_case_tables=[
     pd.DataFrame(columns=['week', 'week end', 'canada tests', 'entero/rhino%', 'at tests',
@@ -65,20 +67,25 @@
        'hmpv%.1':1, 'qc tests':1, 'hmpv%.2':1, 'on tests':1,
        'hmpv%.3':1, 'pr tests':1, 'hmpv%.4':1, 'bc tests':1,
        'hmpv%.5':1}]),
-    pd.DataFrame(columns=["week end","pos_tests","percent_pos"])]
+    pd.DataFrame(columns=["week end","pos_tests","percent_pos"]),
+    pd.DataFrame([{"week":32,"week end":"2017-08-12"}]),
+    pd.DataFrame([{"week":25,"week end":"2017-08-12"}])]
 
 example_edge_case_captions=[
     [t for t in captions if "Entero" in t.text][0],
     [t for t in captions if "Adeno" in t.text][0],
     [t for t in captions if "RSV" in t.text][0],
     [t for t in captions if "RSV" in t.text][0],
     [t for t in captions if "hMPV" in t.text][0],
-    [t for t in captions if "hMPV" in t.text][0]]
+    [t for t in captions if "hMPV" in t.text][0],
+    [t for t in captions if "Influenza" in t.text][0],
+    [t for t in captions if "Number" in t.text][0]]
 
 example_edge_case_seasons=[["2017","2018"],["2017","2018"],["2017","2018"],
-                           ["2015","2016"],["2022","2023"],["2021","2022"]]
+                           ["2015","2016"],["2022","2023"],["2021","2022"],
+                           ["2016","2017"],["2017","2018"]]
 
-example_edge_case_weeks=[35,35,47,41,11,10]
+example_edge_case_weeks=[35,35,47,41,11,10,32,26]
 
 class TestPullHistoric():