Merge pull request #94 from opentargets/tskir-1703-ontoma-implementation

ireneisdoomed · web-flow · commit fc480865786c · 2021-09-15T08:40:05.000+01:00
General OnToma module for PySpark parsers + PhenoDigm & PanelApp implementation
diff --git a/common/ontology.py b/common/ontology.py
@@ -0,0 +1,75 @@
+import logging
+import random
+import time
+
+from ontoma.interface import OnToma
+from pandarallel import pandarallel
+from pyspark.sql.functions import col, when
+
+# How many times _simple_retry calls a function before giving up and returning an empty result.
+ONTOMA_MAX_ATTEMPTS = 3
+# Runs at import time, so that the pandas `parallel_apply` method used in add_efo_mapping is available.
+pandarallel.initialize()
+
+
+def _simple_retry(func, **kwargs):
+    """Call func(**kwargs), retrying on failure up to ONTOMA_MAX_ATTEMPTS times.
+
+    Cannot be a decorator, so that the functions could still be pickled.
+
+    Args:
+        func: Callable to invoke.
+        **kwargs: Keyword arguments passed to func unchanged on every attempt.
+
+    Returns:
+        The return value of the first successful call, or an empty list if every attempt raised an exception.
+    """
+    for attempt in range(1, ONTOMA_MAX_ATTEMPTS + 1):
+        try:
+            return func(**kwargs)
+        # Deliberately broad, because any lookup failure should be retried. Unlike a bare `except:`, this lets
+        # KeyboardInterrupt and SystemExit propagate, so the run can still be interrupted.
+        except Exception as error:
+            logging.warning(f'OnToma lookup attempt {attempt}/{ONTOMA_MAX_ATTEMPTS} failed for {kwargs!r}: {error!r}')
+            # If this is not the last attempt, wait (5 to 15 seconds, randomised) until the next one.
+            if attempt != ONTOMA_MAX_ATTEMPTS:
+                time.sleep(5 + 10 * random.random())
+    logging.error(f'OnToma lookup failed for {kwargs!r}')
+    return []
+
+
+def _ontoma_udf(row, ontoma_instance):
+    """Look up one (disease name, disease ID) row in OnToma and return the `id_ot_schema` of every match.
+
+    The disease name is queried first (because that branch of OnToma is more stable). The disease ID is only
+    queried when the name lookup produced nothing.
+
+    Args:
+        row: Mapping-like row providing the 'diseaseFromSource' and 'diseaseFromSourceId' fields.
+        ontoma_instance: Object whose `find_term(query=..., code=...)` method performs the lookup.
+
+    Returns:
+        List of `id_ot_schema` values, one per match; empty when nothing was found.
+    """
+    name = row['diseaseFromSource']
+    raw_id = row['diseaseFromSourceId']
+    # The identifier is queried with underscores replaced by colons.
+    identifier = raw_id.replace('_', ':') if raw_id else None
+
+    hits = _simple_retry(ontoma_instance.find_term, query=name, code=False) if name else []
+    if hits:
+        return [hit.id_ot_schema for hit in hits]
+
+    # Fall back to the identifier, but only when it contains a colon.
+    if identifier and ':' in identifier:
+        hits = _simple_retry(ontoma_instance.find_term, query=identifier, code=True)
+    return [hit.id_ot_schema for hit in hits]
+
+
+def add_efo_mapping(evidence_strings, spark_instance, ontoma_cache_dir=None):
+    """Given evidence strings with diseaseFromSource and diseaseFromSourceId fields, try to populate EFO mapping
+    field diseaseFromSourceMappedId. In case there are multiple matches, the evidence strings will be exploded
+    accordingly.
+
+    Currently, both source columns (diseaseFromSource and diseaseFromSourceId) need to be present in the original
+    schema, although they do not have to be populated for all rows. Rows where either source column is null are
+    handled too: nulls stay nulls in the intermediate mapping table and the join back is null-safe.
+
+    Args:
+        evidence_strings: Spark DataFrame containing the diseaseFromSource and diseaseFromSourceId columns.
+        spark_instance: Spark session used to create the intermediate mapping DataFrame.
+        ontoma_cache_dir: Optional cache directory, passed to OnToma as `cache_dir`.
+
+    Returns:
+        Spark DataFrame with the columns of evidence_strings plus diseaseFromSourceMappedId, which is null for
+        rows without a mapping.
+    """
+    join_keys = ['diseaseFromSource', 'diseaseFromSourceId']
+    mapped_column = 'diseaseFromSourceMappedId'
+
+    logging.info('Collect all distinct (disease name, disease ID) pairs.')
+    disease_info_to_map = (
+        evidence_strings
+        .select(*join_keys)
+        .distinct()
+        .toPandas()
+    )
+
+    logging.info('Initialise OnToma instance')
+    ontoma_instance = OnToma(cache_dir=ontoma_cache_dir)
+
+    logging.info('Map disease information to EFO.')
+    disease_info_to_map[mapped_column] = disease_info_to_map.parallel_apply(
+        _ontoma_udf, args=(ontoma_instance,), axis=1
+    )
+    # One row per mapped ID; pairs with no mapping (empty list) become a single row with a missing value.
+    disease_info_to_map = disease_info_to_map.explode(mapped_column)
+
+    logging.info('Join the resulting information into the evidence strings.')
+    # Build plain records in which every missing value (None or NaN; NaN is the only value not equal to itself)
+    # is a real None. Casting the whole frame to str instead would turn them into the strings 'None' / 'nan',
+    # which can never be joined back to null keys.
+    mapping_records = [
+        tuple(None if value is None or value != value else str(value) for value in record)
+        for record in disease_info_to_map.itertuples(index=False, name=None)
+    ]
+    # An explicit schema is needed because all-null columns (or an empty list) cannot be type-inferred.
+    disease_info_df = spark_instance.createDataFrame(
+        mapping_records,
+        schema=', '.join(f'{name} string' for name in join_keys + [mapped_column]),
+    )
+
+    # A regular equi-join never matches null keys, so rows lacking a disease name or ID would lose their mapping.
+    # eqNullSafe treats two nulls as equal.
+    join_condition = [evidence_strings[key].eqNullSafe(disease_info_df[key]) for key in join_keys]
+    joined = evidence_strings.join(disease_info_df, on=join_condition, how='left')
+    # Joining on an expression keeps both copies of the key columns; drop the ones from the mapping table.
+    for key in join_keys:
+        joined = joined.drop(disease_info_df[key])
+
+    # Same column order as a join on column names: keys first, remaining evidence columns, then the mapping.
+    other_columns = [name for name in evidence_strings.columns if name not in join_keys]
+    return joined.select(*join_keys, *other_columns, mapped_column)
diff --git a/envs/environment-lock.yml b/envs/environment-lock.yml
@@ -249,5 +249,7 @@ dependencies:
   - zlib=1.2.11
   - zstd=1.5.0
   - pip:
+    - dill==0.3.4
     - obonet==0.3.0
-    - ontoma==0.0.17
+    - ontoma==1.0.0
+    - pandarallel==1.5.2
diff --git a/envs/environment.yml b/envs/environment.yml
@@ -15,4 +15,5 @@ dependencies:
   - snakemake==6.0.0
   - tqdm=4.58.0
   - pip:
-    - ontoma==0.0.17
+    - ontoma==1.0.0
+    - pandarallel==1.5.2
diff --git a/modules/PanelApp.py b/modules/PanelApp.py
@@ -13,6 +13,8 @@
     array, array_distinct, col, collect_set, concat, explode, lit, regexp_extract, regexp_replace, split, trim, when
 )
 
+from common.ontology import add_efo_mapping
+
 
 class PanelAppEvidenceGenerator:
 
@@ -238,6 +240,9 @@ def generate_panelapp_evidence(
             .distinct()
         )
 
+        # Add EFO mapping information.
+        panelapp_df = add_efo_mapping(evidence_strings=panelapp_df, spark_instance=self.spark)
+
         logging.info('Save data.')
         with tempfile.TemporaryDirectory() as tmp_dir_name:
             (
diff --git a/modules/PhenoDigm.py b/modules/PhenoDigm.py
@@ -16,6 +16,8 @@
 import requests
 from retry import retry
 
+from common.ontology import add_efo_mapping
+
 
 # The tables and their fields to fetch from SOLR. Other tables (not currently used): gene, disease_gene_summary.
 IMPC_SOLR_TABLES = {
@@ -410,13 +412,18 @@ def generate_phenodigm_evidence_strings(self, score_cutoff):
             # Add constant value columns.
             .withColumn('datasourceId', pf.lit('phenodigm'))
             .withColumn('datatypeId', pf.lit('animal_model'))
+        )
 
-            # Ensure stable column order.
-            .select('biologicalModelAllelicComposition', 'biologicalModelGeneticBackground', 'biologicalModelId',
-                    'datasourceId', 'datatypeId', 'diseaseFromSource', 'diseaseFromSourceId',
-                    'diseaseModelAssociatedHumanPhenotypes', 'diseaseModelAssociatedModelPhenotypes', 'literature',
-                    'resourceScore', 'targetFromSourceId', 'targetInModel', 'targetInModelEnsemblId',
-                    'targetInModelMgiId')
+        # Add EFO mapping information.
+        self.evidence = add_efo_mapping(evidence_strings=self.evidence, spark_instance=self.spark,
+                                        ontoma_cache_dir=self.cache_dir)
+
+        # Ensure stable column order.
+        self.evidence = self.evidence.select(
+            'biologicalModelAllelicComposition', 'biologicalModelGeneticBackground', 'biologicalModelId',
+            'datasourceId', 'datatypeId', 'diseaseFromSource', 'diseaseFromSourceId', 'diseaseFromSourceMappedId',
+            'diseaseModelAssociatedHumanPhenotypes', 'diseaseModelAssociatedModelPhenotypes', 'literature',
+            'resourceScore', 'targetFromSourceId', 'targetInModel', 'targetInModelEnsemblId', 'targetInModelMgiId'
         )
 
     def generate_mouse_phenotypes_dataset(self):

Original file line number	Diff line number	Diff line change
`@@ -13,6 +13,8 @@`
`13`	`13`	`array, array_distinct, col, collect_set, concat, explode, lit, regexp_extract, regexp_replace, split, trim, when`
`14`	`14`	`)`
`15`	`15`
	`16`	`+from common.ontology import add_efo_mapping`
	`17`	`+`
`16`	`18`
`17`	`19`	`class PanelAppEvidenceGenerator:`
`18`	`20`
`@@ -238,6 +240,9 @@ def generate_panelapp_evidence(`
`238`	`240`	`.distinct()`
`239`	`241`	`)`
`240`	`242`
	`243`	`+ # Add EFO mapping information.`
	`244`	`+ panelapp_df = add_efo_mapping(evidence_strings=panelapp_df, spark_instance=self.spark)`
	`245`	`+`
`241`	`246`	`logging.info('Save data.')`
`242`	`247`	`with tempfile.TemporaryDirectory() as tmp_dir_name:`
`243`	`248`	`(`