Skip to content

Commit fc48086

Browse files
Merge pull request #94 from opentargets/tskir-1703-ontoma-implementation
General OnToma module for PySpark parsers + PhenoDigm & PanelApp implementation
2 parents 92c75ec + e8b66bc commit fc48086

File tree

5 files changed

+98
-8
lines changed

5 files changed

+98
-8
lines changed

common/ontology.py

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
import logging
2+
import random
3+
import time
4+
5+
from ontoma.interface import OnToma
6+
from pandarallel import pandarallel
7+
from pyspark.sql.functions import col, when
8+
9+
ONTOMA_MAX_ATTEMPTS = 3
10+
pandarallel.initialize()
11+
12+
13+
def _simple_retry(func, **kwargs):
    """Call ``func(**kwargs)``, retrying on failure up to ONTOMA_MAX_ATTEMPTS times.

    Cannot be a decorator, so that the functions could still be pickled.

    Args:
        func: Callable to invoke.
        **kwargs: Keyword arguments forwarded unchanged to ``func``.

    Returns:
        The return value of the first successful call, or an empty list if every attempt raised an exception.
    """
    for attempt in range(1, ONTOMA_MAX_ATTEMPTS + 1):
        try:
            return func(**kwargs)
        except Exception as error:
            # Only ``Exception`` subclasses are retried: KeyboardInterrupt and SystemExit must propagate so that the
            # process can still be stopped while lookups are failing.
            logging.warning(
                'OnToma lookup attempt %d of %d failed for %r: %r', attempt, ONTOMA_MAX_ATTEMPTS, kwargs, error
            )
            # If this is not the last attempt, wait 5 to 15 seconds (randomised) until the next one.
            if attempt != ONTOMA_MAX_ATTEMPTS:
                time.sleep(5 + 10 * random.random())
    logging.error(f'OnToma lookup failed for {kwargs!r}')
    return []
24+
25+
26+
def _ontoma_udf(row, ontoma_instance):
    """Look up one (disease name, disease ID) row with OnToma and return the ``id_ot_schema`` of every match.

    The disease name is queried first (because that branch of OnToma is more stable). The disease ID is only
    queried as a fallback, when the name is missing or produced no matches.

    Args:
        row: Mapping-like record with the keys ``diseaseFromSource`` and ``diseaseFromSourceId``.
        ontoma_instance: Object providing the ``find_term`` method used for the lookups.

    Returns:
        A list with the ``id_ot_schema`` attribute of each match; empty when nothing was found.
    """
    name = row['diseaseFromSource']
    raw_id = row['diseaseFromSourceId']
    # Underscores in the source ID are turned into colons before the lookup.
    lookup_id = raw_id.replace('_', ':') if raw_id else None

    hits = _simple_retry(ontoma_instance.find_term, query=name, code=False) if name else []
    if hits:
        return [hit.id_ot_schema for hit in hits]

    # Fall back to the ID, but only if it contains a colon once normalised.
    if lookup_id and ':' in lookup_id:
        hits = _simple_retry(ontoma_instance.find_term, query=lookup_id, code=True)
    return [hit.id_ot_schema for hit in hits]
36+
37+
38+
def add_efo_mapping(evidence_strings, spark_instance, ontoma_cache_dir=None):
    """Given evidence strings with diseaseFromSource and diseaseFromSourceId fields, try to populate EFO mapping
    field diseaseFromSourceMappedId. In case there are multiple matches, the evidence strings will be exploded
    accordingly.

    Currently, both source columns (diseaseFromSource and diseaseFromSourceId) need to be present in the original
    schema, although they do not have to be populated for all rows. Rows where either column is null are matched
    with a null-safe join, so they receive their mappings as well.

    Args:
        evidence_strings: Spark dataframe containing the diseaseFromSource and diseaseFromSourceId columns.
        spark_instance: Spark session used to create the intermediate mapping dataframe.
        ontoma_cache_dir: Optional cache directory, passed to OnToma as ``cache_dir``.

    Returns:
        The evidence strings left-joined with the mappings: the two source columns first, then the remaining
        original columns, then diseaseFromSourceMappedId (null where no mapping was found).
    """
    join_keys = ['diseaseFromSource', 'diseaseFromSourceId']
    mapped_column = 'diseaseFromSourceMappedId'

    logging.info('Collect all distinct (disease name, disease ID) pairs.')
    disease_info_to_map = (
        evidence_strings
        .select(*join_keys)
        .distinct()
        .toPandas()
    )

    logging.info('Initialise OnToma instance')
    ontoma_instance = OnToma(cache_dir=ontoma_cache_dir)

    logging.info('Map disease information to EFO.')
    disease_info_to_map[mapped_column] = disease_info_to_map.parallel_apply(
        _ontoma_udf, args=(ontoma_instance,), axis=1
    )
    # One row per mapping; an empty mapping list becomes a single row with a missing value.
    disease_info_to_map = disease_info_to_map.explode(mapped_column).reset_index(drop=True)

    # Coerce present values to strings, but keep missing ones as real nulls. A plain astype(str) would turn them
    # into the literal strings 'None' / 'nan', which can never match the null keys in the evidence strings.
    is_present = disease_info_to_map.notna()
    disease_info_to_map = disease_info_to_map.astype(str).astype(object).where(is_present, None)

    logging.info('Join the resulting information into the evidence strings.')
    mapping_columns = join_keys + [mapped_column]
    # The schema is given explicitly, because column types cannot be inferred when a column contains only nulls.
    disease_info_df = spark_instance.createDataFrame(
        disease_info_to_map[mapping_columns],
        schema=', '.join(f'{name} string' for name in mapping_columns),
    )
    # Rename the keys on the mapping side so that the join condition can refer to both sides unambiguously.
    lookup_keys = [f'_ontoma_{key}' for key in join_keys]
    for key, lookup_key in zip(join_keys, lookup_keys):
        disease_info_df = disease_info_df.withColumnRenamed(key, lookup_key)

    # Spark's default equality join never matches null keys; eqNullSafe treats two nulls as equal.
    join_condition = (
        col(join_keys[0]).eqNullSafe(col(lookup_keys[0]))
        & col(join_keys[1]).eqNullSafe(col(lookup_keys[1]))
    )
    # Same column order as a join on the key names: keys first, then the other original columns, then the mapping.
    output_columns = (
        join_keys
        + [name for name in evidence_strings.columns if name not in join_keys]
        + [mapped_column]
    )
    return (
        evidence_strings
        .join(disease_info_df, on=join_condition, how='left')
        .select(*output_columns)
    )

envs/environment-lock.yml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -249,5 +249,7 @@ dependencies:
249249
- zlib=1.2.11
250250
- zstd=1.5.0
251251
- pip:
252+
- dill==0.3.4
252253
- obonet==0.3.0
253-
- ontoma==0.0.17
254+
- ontoma==1.0.0
255+
- pandarallel==1.5.2

envs/environment.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,4 +15,5 @@ dependencies:
1515
- snakemake==6.0.0
1616
- tqdm=4.58.0
1717
- pip:
18-
- ontoma==0.0.17
18+
- ontoma==1.0.0
19+
- pandarallel==1.5.2

modules/PanelApp.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313
array, array_distinct, col, collect_set, concat, explode, lit, regexp_extract, regexp_replace, split, trim, when
1414
)
1515

16+
from common.ontology import add_efo_mapping
17+
1618

1719
class PanelAppEvidenceGenerator:
1820

@@ -238,6 +240,9 @@ def generate_panelapp_evidence(
238240
.distinct()
239241
)
240242

243+
# Add EFO mapping information.
244+
panelapp_df = add_efo_mapping(evidence_strings=panelapp_df, spark_instance=self.spark)
245+
241246
logging.info('Save data.')
242247
with tempfile.TemporaryDirectory() as tmp_dir_name:
243248
(

modules/PhenoDigm.py

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616
import requests
1717
from retry import retry
1818

19+
from common.ontology import add_efo_mapping
20+
1921

2022
# The tables and their fields to fetch from SOLR. Other tables (not currently used): gene, disease_gene_summary.
2123
IMPC_SOLR_TABLES = {
@@ -410,13 +412,18 @@ def generate_phenodigm_evidence_strings(self, score_cutoff):
410412
# Add constant value columns.
411413
.withColumn('datasourceId', pf.lit('phenodigm'))
412414
.withColumn('datatypeId', pf.lit('animal_model'))
415+
)
413416

414-
# Ensure stable column order.
415-
.select('biologicalModelAllelicComposition', 'biologicalModelGeneticBackground', 'biologicalModelId',
416-
'datasourceId', 'datatypeId', 'diseaseFromSource', 'diseaseFromSourceId',
417-
'diseaseModelAssociatedHumanPhenotypes', 'diseaseModelAssociatedModelPhenotypes', 'literature',
418-
'resourceScore', 'targetFromSourceId', 'targetInModel', 'targetInModelEnsemblId',
419-
'targetInModelMgiId')
417+
# Add EFO mapping information.
418+
self.evidence = add_efo_mapping(evidence_strings=self.evidence, spark_instance=self.spark,
419+
ontoma_cache_dir=self.cache_dir)
420+
421+
# Ensure stable column order.
422+
self.evidence = self.evidence.select(
423+
'biologicalModelAllelicComposition', 'biologicalModelGeneticBackground', 'biologicalModelId',
424+
'datasourceId', 'datatypeId', 'diseaseFromSource', 'diseaseFromSourceId', 'diseaseFromSourceMappedId',
425+
'diseaseModelAssociatedHumanPhenotypes', 'diseaseModelAssociatedModelPhenotypes', 'literature',
426+
'resourceScore', 'targetFromSourceId', 'targetInModel', 'targetInModelEnsemblId', 'targetInModelMgiId'
420427
)
421428

422429
def generate_mouse_phenotypes_dataset(self):

0 commit comments

Comments
 (0)