from collections import defaultdict

import nedrex
from nedrex.core import get_nodes, get_edges, get_api_key

from drugstone import models
from drugstone.management.includes.NodeCache import NodeCache
from drugstone.management.includes import DatasetLoader
from drugstone.models import PPIDataset
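
# Helpers that page through a NeDRex node/edge collection in fixed-size batches
# and invoke the given callback on every record.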
def iter_node_collection(coll_name, eval):
    offset = 0
    limit = 10000
    while True:
        result = get_nodes(coll_name, offset=offset, limit=limit)
        if not result:
            return
        for node in result:
            eval(node)
        offset += limit

def iter_edge_collection(coll_name, eval):
    offset = 0
    limit = 10000
    while True:
        result = get_edges(coll_name, offset=offset, limit=limit)
        if not result:
            return
        for edge in result:
            eval(edge)
        offset += limit
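
# Splits freshly fetched records into updates (already cached but changed) and
# creates (not cached yet); changed cached objects are refreshed in place.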
def identify_updates(new_list, old_list):
    u = list()  # existing entries whose content changed
    c = list()  # entirely new entries that need to be created
    for id in new_list:
        if id not in old_list:
            c.append(new_list[id])
        elif new_list[id] != old_list[id]:
            old_list[id].update(new_list[id])
            u.append(old_list[id])
    return u, c
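
# Renders a list such as ['approved', 'investigational'] as the plain string
# "approved, investigational"; returns None for an empty or missing list.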
def format_list(l):
    if l is not None and len(l) > 0:
        s = str(l)[1:]
        return s[:len(s) - 1].replace("'", "")
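
# NeDRex primary domain IDs are prefixed with their source database, e.g.
# "uniprot.P35222"; strip everything up to and including the first dot.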
def to_id(string):
    idx = string.index('.')
    return string[idx + 1:]
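
# Imports proteins, genes, drugs, disorders, and their relations from a NeDRex
# API instance into the Drugstone database, switching between the licenced and
# unlicenced NeDRex endpoints as required by the target dataset.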
class NedrexImporter:
    cache: NodeCache = None
    url: str = ''
    licenced_url: str = ''
    unlicenced_url: str = ''
    licenced_on: bool = True
    api_key: str = None

    def __init__(self, base_url_licenced, base_url_unlicenced, cache: NodeCache):
        self.licenced_url = base_url_licenced
        self.unlicenced_url = base_url_unlicenced
        self.cache = cache
        self.set_licenced(False)

    def get_api_key(self):
        if self.api_key is None:
            self.api_key = get_api_key(accept_eula=True)
        return self.api_key
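
    # Points the nedrex client at the licenced or unlicenced API base URL;
    # the licenced endpoint additionally requires an API key.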
    def set_licenced(self, on):
        if on == self.licenced_on:
            return
        self.url = self.licenced_url if on else self.unlicenced_url
        nedrex.config.set_url_base(self.url)
        if on:
            nedrex.config.set_api_key(self.get_api_key())
        self.licenced_on = on
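
    # Fetches all protein and gene nodes plus the protein-encoded-by-gene edges,
    # keeps only proteins that map to an Entrez gene, and persists the result.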
    def import_proteins(self, update):
        self.set_licenced(False)
        proteins = dict()
        gene_to_prots = defaultdict(lambda: set())
        if update:
            self.cache.init_proteins()

        def format_prot_name(name):
            # Protein names may carry a trailing "{...}" qualifier or an
            # "ID=name" prefix; reduce both to the bare name.
            if '{' in name:
                idx1 = name.index('{')
                adjusted_name = name[:idx1 - 1].strip() if idx1 > 0 else ''
                if '=' in adjusted_name:
                    idx2 = adjusted_name.index('=')
                    return adjusted_name[idx2 + 1:].strip()
                return adjusted_name
            return name

        def add_protein(node):
            id = to_id(node['primaryDomainId'])
            name = format_prot_name(node['geneName'])
            gene = name
            if len(node['synonyms']) > 0:
                name = format_prot_name(node['synonyms'][0])
            proteins[id] = models.Protein(uniprot_code=id, protein_name=name, gene=gene)

        def add_edges(edge):
            id = to_id(edge['sourceDomainId'])
            protein = proteins[id]
            protein.entrez = to_id(edge['targetDomainId'])
            gene_to_prots[protein.entrez].add(id)

        def add_genes(node):
            id = to_id(node['primaryDomainId'])
            for prot_id in gene_to_prots[id]:
                try:
                    protein = proteins[prot_id]
                    protein.protein_name = node['synonyms'][0]
                except (KeyError, IndexError):
                    pass

        iter_node_collection('protein', add_protein)
        iter_edge_collection('protein_encoded_by_gene', add_edges)

        # Drop proteins without an encoding gene: only entries reachable via
        # gene_to_prots carry an Entrez ID.
        with_entrez = dict()
        for ids in gene_to_prots.values():
            for id in ids:
                with_entrez[id] = proteins[id]
        proteins = with_entrez

        iter_node_collection('gene', add_genes)

        if update:
            (updates, creates) = identify_updates(proteins, self.cache.proteins)
            for u in updates:
                u.save()
            models.Protein.objects.bulk_create(creates)
            for protein in creates:
                self.cache.proteins[protein.uniprot_code] = protein
                self.cache.protein_updates.add(protein.uniprot_code)
        else:
            models.Protein.objects.bulk_create(proteins.values())
            self.cache.proteins = proteins
        return len(self.cache.proteins)
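
    # Fetches all drug nodes; the drug-group list (approved, investigational, ...)
    # is flattened into the status field via format_list.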
    def import_drugs(self, update):
        self.set_licenced(False)
        drugs = dict()
        if update:
            self.cache.init_drugs()

        def add_drug(node):
            id = to_id(node['primaryDomainId'])
            drugs[id] = models.Drug(drug_id=id, name=node['displayName'], status=format_list(node['drugGroups']))

        iter_node_collection('drug', add_drug)

        if update:
            (updates, creates) = identify_updates(drugs, self.cache.drugs)
            for u in updates:
                u.save()
            models.Drug.objects.bulk_create(creates)
            for drug in creates:
                self.cache.drug_updates.add(drug.drug_id)
                self.cache.drugs[drug.drug_id] = drug
        else:
            models.Drug.objects.bulk_create(drugs.values())
            self.cache.drugs = drugs
        return len(self.cache.drugs)
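
    # Fetches all disorder nodes, keyed by their MONDO ID.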
    def import_disorders(self, update):
        self.set_licenced(False)
        disorders = dict()
        if update:
            self.cache.init_disorders()

        def add_disorder(node):
            id = to_id(node['primaryDomainId'])
            disorders[id] = models.Disorder(mondo_id=id, label=node['displayName'], icd10=format_list(node['icd10']))

        iter_node_collection('disorder', add_disorder)

        if update:
            (updates, creates) = identify_updates(disorders, self.cache.disorders)
            for u in updates:
                u.save()
            models.Disorder.objects.bulk_create(creates)
            for disorder in creates:
                self.cache.disorder_updates.add(disorder.mondo_id)
                self.cache.disorders[disorder.mondo_id] = disorder
        else:
            models.Disorder.objects.bulk_create(disorders.values())
            self.cache.disorders = disorders
        return len(self.cache.disorders)
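
    # Imports drug–protein edges for the given PDI dataset. Each edge is also
    # attributed to every database it originates from, so one NeDRex edge can
    # yield several dataset-specific interaction rows.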
    def import_drug_target_interactions(self, dataset, update):
        licenced = dataset.licenced
        self.set_licenced(licenced)
        self.cache.init_drugs()
        self.cache.init_proteins()

        bulk = set()
        existing = set()
        if update:
            for edge in models.ProteinDrugInteraction.objects.filter(pdi_dataset=dataset):
                existing.add(edge.__hash__())

        source_datasets = dict()
        source_is_licenced = dict()

        def get_dataset(source):
            if source not in source_datasets:
                source_datasets[source] = DatasetLoader.get_pdi_nedrex_dataset(self.url, licenced, source)
            return source_datasets[source]

        def is_licenced(source):
            if source not in source_is_licenced:
                source_is_licenced[source] = DatasetLoader.is_licenced_pdi_source(source)
            return source_is_licenced[source]

        def add_dpi(edge):
            try:
                drug = self.cache.get_drug_by_drugbank(to_id(edge['sourceDomainId']))
                protein = self.cache.get_protein_by_uniprot(to_id(edge['targetDomainId']))
                e = models.ProteinDrugInteraction(pdi_dataset=dataset, drug=drug, protein=protein)
                if not update or e.__hash__() not in existing:
                    bulk.add(e)
                # attribute the edge to each contributing source database
                # (assumes the NeDRex edge record carries a 'dataSources' list)
                for source in edge['dataSources']:
                    if licenced and not is_licenced(source):
                        continue
                    bulk.add(models.ProteinDrugInteraction(pdi_dataset=get_dataset(source), drug=drug,
                                                           protein=protein))
            except KeyError:
                pass

        iter_edge_collection('drug_has_target', add_dpi)
        models.ProteinDrugInteraction.objects.bulk_create(bulk)

        new_datasets = [dataset]
        new_datasets.extend(source_datasets.values())
        DatasetLoader.remove_old_pdi_data(new_datasets, licenced)
        return len(bulk)
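
    # Imports protein–protein edges for the given PPI dataset. PPIs come from
    # the dedicated nedrex.ppi endpoint rather than a generic edge collection;
    # only experimentally detected interactions ("exp") are fetched.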
    def import_protein_protein_interactions(self, dataset: PPIDataset, update):
        licenced = dataset.licenced
        self.set_licenced(licenced)
        self.cache.init_proteins()

        bulk = list()
        existing = set()
        if update:
            for edge in models.ProteinProteinInteraction.objects.filter(ppi_dataset=dataset):
                existing.add(edge.__hash__())

        source_datasets = dict()
        source_is_licenced = dict()

        def get_dataset(source):
            if source not in source_datasets:
                source_datasets[source] = DatasetLoader.get_ppi_nedrex_dataset(self.url, licenced, source)
            return source_datasets[source]

        def is_licenced(source):
            if source not in source_is_licenced:
                source_is_licenced[source] = DatasetLoader.is_licenced_ppi_source(source)
            return source_is_licenced[source]

        def iter_ppi(eval):
            from nedrex import ppi
            offset = 0
            limit = 10000
            while True:
                result = ppi.ppis({"exp"}, skip=offset, limit=limit)
                if not result:
                    return
                for edge in result:
                    eval(edge)
                offset += limit

        def add_ppi(edge):
            try:
                protein1 = self.cache.get_protein_by_uniprot(to_id(edge['memberOne']))
                protein2 = self.cache.get_protein_by_uniprot(to_id(edge['memberTwo']))
                e = models.ProteinProteinInteraction(ppi_dataset=dataset, from_protein=protein1, to_protein=protein2)
                if not update or e.__hash__() not in existing:
                    bulk.append(e)
                # attribute the edge to each contributing source database
                for source in edge['dataSources']:
                    if licenced and not is_licenced(source):
                        continue
                    bulk.append(
                        models.ProteinProteinInteraction(ppi_dataset=get_dataset(source), from_protein=protein1,
                                                         to_protein=protein2))
            except KeyError:
                pass

        iter_ppi(add_ppi)
        models.ProteinProteinInteraction.objects.bulk_create(bulk)

        new_datasets = [dataset]
        new_datasets.extend(source_datasets.values())
        DatasetLoader.remove_old_ppi_data(new_datasets, licenced)
        return len(bulk)
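
    # Imports gene–disorder associations as protein–disorder rows: the NeDRex
    # edge references a gene, which may map to several cached proteins.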
    def import_protein_disorder_associations(self, dataset, update):
        licenced = dataset.licenced
        self.set_licenced(licenced)
        self.cache.init_disorders()
        self.cache.init_proteins()

        bulk = set()
        existing = set()
        if update:
            for edge in models.ProteinDisorderAssociation.objects.filter(pdis_dataset=dataset):
                existing.add(edge.__hash__())

        source_datasets = dict()
        source_is_licenced = dict()

        def get_dataset(source):
            if source not in source_datasets:
                source_datasets[source] = DatasetLoader.get_pdis_nedrex_dataset(self.url, licenced, source)
            return source_datasets[source]

        def is_licenced(source):
            if source not in source_is_licenced:
                source_is_licenced[source] = DatasetLoader.is_licenced_pdis_source(source)
            return source_is_licenced[source]

        def add_pdis(edge):
            try:
                disorder = self.cache.get_disorder_by_mondo(to_id(edge['targetDomainId']))
                for protein in self.cache.get_proteins_by_entrez(to_id(edge['sourceDomainId'])):
                    e = models.ProteinDisorderAssociation(pdis_dataset=dataset, protein=protein, disorder=disorder,
                                                          score=edge['score'])
                    if not update or e.__hash__() not in existing:
                        bulk.add(e)
                    # attribute the association to each contributing source database
                    for source in edge['dataSources']:
                        if licenced and not is_licenced(source):
                            continue
                        bulk.add(
                            models.ProteinDisorderAssociation(pdis_dataset=get_dataset(source), protein=protein,
                                                              disorder=disorder,
                                                              score=edge['score']))
            except KeyError:
                pass

        iter_edge_collection('gene_associated_with_disorder', add_pdis)
        models.ProteinDisorderAssociation.objects.bulk_create(bulk)

        new_datasets = [dataset]
        new_datasets.extend(source_datasets.values())
        DatasetLoader.remove_old_pdis_data(new_datasets, licenced)
        return len(bulk)
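
    # Imports drug–disorder indication edges for the given dataset.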
    def import_drug_disorder_indications(self, dataset, update):
        licenced = dataset.licenced
        self.set_licenced(licenced)
        self.cache.init_disorders()
        self.cache.init_drugs()

        bulk = set()
        existing = set()
        if update:
            for edge in models.DrugDisorderIndication.objects.filter(drdi_dataset=dataset):
                existing.add(edge.__hash__())

        source_datasets = dict()
        source_is_licenced = dict()

        def get_dataset(source):
            if source not in source_datasets:
                source_datasets[source] = DatasetLoader.get_drdi_nedrex_dataset(self.url, licenced, source)
            return source_datasets[source]

        def is_licenced(source):
            if source not in source_is_licenced:
                source_is_licenced[source] = DatasetLoader.is_licenced_drdi_source(source)
            return source_is_licenced[source]

        def add_drdis(edge):
            try:
                drug = self.cache.get_drug_by_drugbank(to_id(edge['sourceDomainId']))
                disorder = self.cache.get_disorder_by_mondo(to_id(edge['targetDomainId']))
                e = models.DrugDisorderIndication(drdi_dataset=dataset, drug=drug, disorder=disorder)
                if not update or e.__hash__() not in existing:
                    bulk.add(e)
                # attribute the indication to each contributing source database
                for source in edge['dataSources']:
                    if licenced and not is_licenced(source):
                        continue
                    bulk.add(
                        models.DrugDisorderIndication(drdi_dataset=get_dataset(source), drug=drug,
                                                      disorder=disorder))
            except KeyError:
                pass

        iter_edge_collection('drug_has_indication', add_drdis)
        models.DrugDisorderIndication.objects.bulk_create(bulk)

        new_datasets = [dataset]
        new_datasets.extend(source_datasets.values())
        DatasetLoader.remove_old_drdi_data(new_datasets, licenced)
        return len(bulk)