diff --git a/Dockerfile b/Dockerfile index aec457815aabba371b0d508055b54d81a2282366..5bafbf3c8633f19057ec65c128d44827a141e4c6 100755 --- a/Dockerfile +++ b/Dockerfile @@ -22,7 +22,7 @@ RUN pip install -r /usr/src/drugstone/requirements.txt RUN pip install gunicorn COPY ./supervisord.conf /etc/supervisor/conf.d/supervisord.conf -COPY ./docker-entrypoint.sh /usr/src/drugstone/docker-entrypoint.sh +#COPY scripts/docker-entrypoint.sh /usr/src/drugstone/docker-entrypoint.sh # COPY ./scripts/ /usr/src/drugstone/scripts/ COPY ./python_nedrex/ /usr/src/drugstone/python_nedrex/ RUN pip install /usr/src/drugstone/python_nedrex/ diff --git a/data_drugstone/Disorders/disorders.tsv b/data/Disorders/disorders.tsv similarity index 100% rename from data_drugstone/Disorders/disorders.tsv rename to data/Disorders/disorders.tsv diff --git a/data_drugstone/DrDi/drugbank-drug_disorder_indication.tsv b/data/DrDi/drugbank-drug_disorder_indication.tsv similarity index 100% rename from data_drugstone/DrDi/drugbank-drug_disorder_indication.tsv rename to data/DrDi/drugbank-drug_disorder_indication.tsv diff --git a/data_drugstone/Drugs/drug-file.txt b/data/Drugs/drug-file.txt similarity index 100% rename from data_drugstone/Drugs/drug-file.txt rename to data/Drugs/drug-file.txt diff --git a/data_drugstone/Drugs/drugbank_labels.csv b/data/Drugs/drugbank_labels.csv similarity index 100% rename from data_drugstone/Drugs/drugbank_labels.csv rename to data/Drugs/drugbank_labels.csv diff --git a/data_drugstone/Expression/gene_tissue_expression.gct b/data/Expression/gene_tissue_expression.gct similarity index 100% rename from data_drugstone/Expression/gene_tissue_expression.gct rename to data/Expression/gene_tissue_expression.gct diff --git a/data_drugstone/Networks/internal_APID_ChEMBL.gt b/data/Networks/internal_APID_ChEMBL.gt similarity index 100% rename from data_drugstone/Networks/internal_APID_ChEMBL.gt rename to data/Networks/internal_APID_ChEMBL.gt diff --git a/data_drugstone/Networks/internal_APID_DGIdb.gt b/data/Networks/internal_APID_DGIdb.gt similarity index 100% rename from data_drugstone/Networks/internal_APID_DGIdb.gt rename to data/Networks/internal_APID_DGIdb.gt diff --git a/data_drugstone/Networks/internal_APID_DrugBank.gt b/data/Networks/internal_APID_DrugBank.gt similarity index 100% rename from data_drugstone/Networks/internal_APID_DrugBank.gt rename to data/Networks/internal_APID_DrugBank.gt diff --git a/data_drugstone/Networks/internal_BioGRID_ChEMBL.gt b/data/Networks/internal_BioGRID_ChEMBL.gt similarity index 100% rename from data_drugstone/Networks/internal_BioGRID_ChEMBL.gt rename to data/Networks/internal_BioGRID_ChEMBL.gt diff --git a/data_drugstone/Networks/internal_BioGRID_DGIdb.gt b/data/Networks/internal_BioGRID_DGIdb.gt similarity index 100% rename from data_drugstone/Networks/internal_BioGRID_DGIdb.gt rename to data/Networks/internal_BioGRID_DGIdb.gt diff --git a/data_drugstone/Networks/internal_BioGRID_DrugBank.gt b/data/Networks/internal_BioGRID_DrugBank.gt similarity index 100% rename from data_drugstone/Networks/internal_BioGRID_DrugBank.gt rename to data/Networks/internal_BioGRID_DrugBank.gt diff --git a/data_drugstone/Networks/internal_STRING_ChEMBL.gt b/data/Networks/internal_STRING_ChEMBL.gt similarity index 100% rename from data_drugstone/Networks/internal_STRING_ChEMBL.gt rename to data/Networks/internal_STRING_ChEMBL.gt diff --git a/data_drugstone/Networks/internal_STRING_DGIdb.gt b/data/Networks/internal_STRING_DGIdb.gt similarity index 100% rename from data_drugstone/Networks/internal_STRING_DGIdb.gt rename to data/Networks/internal_STRING_DGIdb.gt diff --git a/data_drugstone/Networks/internal_STRING_DrugBank.gt b/data/Networks/internal_STRING_DrugBank.gt similarity index 100% rename from data_drugstone/Networks/internal_STRING_DrugBank.gt rename to data/Networks/internal_STRING_DrugBank.gt diff --git a/data_drugstone/PDI/BIOGRID-CHEMICALS-3.5.187.chemtab.txt b/data/PDI/BIOGRID-CHEMICALS-3.5.187.chemtab.txt similarity index 100% rename from data_drugstone/PDI/BIOGRID-CHEMICALS-3.5.187.chemtab.txt rename to data/PDI/BIOGRID-CHEMICALS-3.5.187.chemtab.txt diff --git a/data_drugstone/PDI/DGIdb_drug_gene_interactions.csv b/data/PDI/DGIdb_drug_gene_interactions.csv similarity index 100% rename from data_drugstone/PDI/DGIdb_drug_gene_interactions.csv rename to data/PDI/DGIdb_drug_gene_interactions.csv diff --git a/data_drugstone/PDI/chembl_drug_gene_interactions.csv b/data/PDI/chembl_drug_gene_interactions.csv similarity index 100% rename from data_drugstone/PDI/chembl_drug_gene_interactions.csv rename to data/PDI/chembl_drug_gene_interactions.csv diff --git a/data_drugstone/PDI/chembl_drug_gene_interactions_uniq.csv b/data/PDI/chembl_drug_gene_interactions_uniq.csv similarity index 100% rename from data_drugstone/PDI/chembl_drug_gene_interactions_uniq.csv rename to data/PDI/chembl_drug_gene_interactions_uniq.csv diff --git a/data_drugstone/PDI/drugbank_drug_gene_interactions.csv b/data/PDI/drugbank_drug_gene_interactions.csv similarity index 100% rename from data_drugstone/PDI/drugbank_drug_gene_interactions.csv rename to data/PDI/drugbank_drug_gene_interactions.csv diff --git a/data_drugstone/PDI/drugbank_drug_gene_interactions_uniq.csv b/data/PDI/drugbank_drug_gene_interactions_uniq.csv similarity index 100% rename from data_drugstone/PDI/drugbank_drug_gene_interactions_uniq.csv rename to data/PDI/drugbank_drug_gene_interactions_uniq.csv diff --git a/data_drugstone/PDi/disgenet-protein_disorder_association.tsv b/data/PDi/disgenet-protein_disorder_association.tsv similarity index 100% rename from data_drugstone/PDi/disgenet-protein_disorder_association.tsv rename to data/PDi/disgenet-protein_disorder_association.tsv diff --git a/data_drugstone/PPDr-temp.graphml b/data/PPDr-temp.graphml similarity index 100% rename from data_drugstone/PPDr-temp.graphml rename to data/PPDr-temp.graphml diff --git a/data_drugstone/PPDr.gt b/data/PPDr.gt similarity index 100% rename from data_drugstone/PPDr.gt rename to data/PPDr.gt diff --git a/data_drugstone/PPI-temp.graphml b/data/PPI-temp.graphml similarity index 100% rename from data_drugstone/PPI-temp.graphml rename to data/PPI-temp.graphml diff --git a/data_drugstone/PPI.gt b/data/PPI.gt similarity index 100% rename from data_drugstone/PPI.gt rename to data/PPI.gt diff --git a/data_drugstone/PPI/BIOGRID-ORGANISM-Homo_sapiens-3.5.187.mitab.txt b/data/PPI/BIOGRID-ORGANISM-Homo_sapiens-3.5.187.mitab.txt similarity index 100% rename from data_drugstone/PPI/BIOGRID-ORGANISM-Homo_sapiens-3.5.187.mitab.txt rename to data/PPI/BIOGRID-ORGANISM-Homo_sapiens-3.5.187.mitab.txt diff --git a/data_drugstone/PPI/apid_9606_Q2.txt b/data/PPI/apid_9606_Q2.txt similarity index 100% rename from data_drugstone/PPI/apid_9606_Q2.txt rename to data/PPI/apid_9606_Q2.txt diff --git a/data_drugstone/PPI/reactome_homo-sapiens-protein-interactions.txt b/data/PPI/reactome_homo-sapiens-protein-interactions.txt similarity index 100% rename from data_drugstone/PPI/reactome_homo-sapiens-protein-interactions.txt rename to data/PPI/reactome_homo-sapiens-protein-interactions.txt diff --git a/data_drugstone/PPI/string_interactions.csv b/data/PPI/string_interactions.csv similarity index 100% rename from data_drugstone/PPI/string_interactions.csv rename to data/PPI/string_interactions.csv diff --git a/data_drugstone/Proteins/entrez_to_ensg.json b/data/Proteins/entrez_to_ensg.json similarity index 100% rename from data_drugstone/Proteins/entrez_to_ensg.json rename to data/Proteins/entrez_to_ensg.json diff --git a/data_drugstone/Proteins/protein_list.csv b/data/Proteins/protein_list.csv similarity index 100% rename from data_drugstone/Proteins/protein_list.csv rename to data/Proteins/protein_list.csv diff --git a/data_drugstone/drug-protein-interaction.txt b/data/drug-protein-interaction.txt similarity index 100% rename from data_drugstone/drug-protein-interaction.txt rename to data/drug-protein-interaction.txt diff --git a/data_drugstone/protein-file.txt b/data/protein-file.txt similarity index 100% rename from data_drugstone/protein-file.txt rename to data/protein-file.txt diff --git a/data_drugstone/protein_protein_interaction_file.txt b/data/protein_protein_interaction_file.txt similarity index 100% rename from data_drugstone/protein_protein_interaction_file.txt rename to data/protein_protein_interaction_file.txt diff --git a/data_drugstone/temp-PPDr.graphml b/data/temp-PPDr.graphml similarity index 100% rename from data_drugstone/temp-PPDr.graphml rename to data/temp-PPDr.graphml diff --git a/docker-compose.yml b/docker-compose.yml index 310e7e96fd948ea10e8c89226afe219fc63237f3..0ea123ad8f598fc938f679cb8c78770f5fd9d2a2 100755 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -6,13 +6,14 @@ services: container_name: drugstone_backend command: - "sh" - - "/usr/src/drugstone/docker-entrypoint.sh" + - "scripts/docker-entrypoint.sh" build: . env_file: - 'docker-django.env.dev' restart: always volumes: - - drugstone_backend_volume:/usr/src/drugstone/drugstone/migrations + - drugstone_db_schema_volume:/usr/src/drugstone/drugstone/migrations + - drugstone_data_volume:/usr/src/drugstone/data ports: - 8001:8000 networks: @@ -30,7 +31,7 @@ services: networks: - drugstone_net volumes: - - drugstone_postgres_volume:/var/lib/postgresql/data_drugstone/ + - drugstone_db_volume:/var/lib/postgresql/data environment: - POSTGRES_DB=drugstone - POSTGRES_USER=drugstone @@ -53,15 +54,13 @@ services: celery: command: - "sh" - - "/usr/src/drugstone/scripts/start_celery_worker.sh" + - "scripts/start_celery_worker.sh" restart: always image: drugstone_backend container_name: drugstone_celery hostname: drugstone_celery env_file: - './docker-django.env.dev' -# volumes: -# - ./:/usr/src/drugstone/ depends_on: - redis - db @@ -70,14 +69,12 @@ services: celery-beat: command: - "sh" - - "/usr/src/drugstone/scripts/start_celery_beat.sh" + - "scripts/start_celery_beat.sh" image: drugstone_backend container_name: drugstone_celery_beat hostname: drugstone_celery_beat env_file: - './docker-django.env.dev' -# volumes: -# - ./:/usr/src/drugstone/ depends_on: - redis - db @@ -97,5 +94,9 @@ networks: drugstone_net: volumes: - drugstone_postgres_volume: - drugstone_backend_volume: \ No newline at end of file + drugstone_db_volume: + external: true + drugstone_db_schema_volume: + external: true + drugstone_data_volume: + external: true \ No newline at end of file diff --git a/drugstone/backend_tasks.py b/drugstone/backend_tasks.py index e8a61344fa58500ce4d4fe8f4c0f2bbf84a82928..01f7827354a707fa62ff2917e39f64d8911c6dd8 100755 --- a/drugstone/backend_tasks.py +++ b/drugstone/backend_tasks.py @@ -40,7 +40,7 @@ def run_task(token, algorithm, parameters): r.set(f'{token}_job_id', f'{job_id}') r.set(f'{token}_started_at', str(datetime.now().timestamp())) - task_hook = TaskHook(json.loads(parameters), './data_drugstone/Networks/', set_progress, set_result) + task_hook = TaskHook(json.loads(parameters), './data/Networks/', set_progress, set_result) try: if algorithm == 'dummy': diff --git a/drugstone/management/commands/import_from_nedrex.py b/drugstone/management/commands/import_from_nedrex.py index dc3f052cfbc6fa42e2df39fb36c4a94aa520275f..b3af226edb95f72395502c6fbae705dbef62b3c1 100644 --- a/drugstone/management/commands/import_from_nedrex.py +++ b/drugstone/management/commands/import_from_nedrex.py @@ -4,6 +4,7 @@ import python_nedrex as nedrex from python_nedrex.core import get_nodes, get_edges, get_api_key from drugstone import models +from drugstone.management.includes.NodeCache import NodeCache def iter_node_collection(coll_name, eval): @@ -35,7 +36,7 @@ def identify_updates(new_list, old_list): c = list() for id in new_list: if id not in old_list: - c.append(id) + c.append(new_list[id]) elif new_list[id] != old_list[id]: old_list[id].update(new_list[id]) u.append(old_list[id]) @@ -45,51 +46,33 @@ def identify_updates(new_list, old_list): def format_list(l): if l is not None and len(l) > 0: s = str(l)[1:] - return s[:len(s) - 1].replace("'","") + return s[:len(s) - 1].replace("'", "") return "" -class nedrex_importer: - proteins = dict() - entrez_to_uniprot = dict() - gene_name_to_uniprot = defaultdict(lambda: set()) - disorders = dict() - drugs = dict() +def to_id(string): + idx = string.index('.') + return string[idx + 1:] - def __init__(self, base_url): + +class NedrexImporter: + cache: NodeCache = None + + def __init__(self, base_url, cache: NodeCache): + self.cache = cache nedrex.config.set_url_base(base_url) api_key = get_api_key(accept_eula=True) nedrex.config.set_api_key(api_key) - def init_proteins(self): - if len(self.proteins) == 0: - print("Generating protein maps...") - for protein in models.Protein.objects.all(): - self.proteins[protein.entrez] = protein - self.entrez_to_uniprot[protein.entrez] = protein.uniprot_code - self.gene_name_to_uniprot[protein.gene].add(protein.uniprot_code) - - def init_drugs(self): - if len(self.drugs) == 0: - print("Generating drug map...") - for drug in models.Drug.objects.all(): - self.drugs[drug.drug_id] = drug - - def init_disorders(self): - if len(self.disorders) == 0: - print("Generating disorder map...") - for disorder in models.Disorder.objects.all(): - self.disorders[disorder.mondo_id] = disorder - def import_proteins(self, update: bool): proteins = dict() gene_to_prots = defaultdict(lambda: set()) if update: - self.init_proteins() + self.cache.init_proteins() def add_protein(node): - id = node['primaryDomainId'].split('.')[1] + id = to_id(node['primaryDomainId']) name = node['geneName'] if len(node['synonyms']) > 0: name = node['synonyms'][0] @@ -100,13 +83,13 @@ class nedrex_importer: proteins[id] = models.Protein(uniprot_code=id, protein_name=name, gene=node['geneName']) def add_edges(edge): - id = edge['sourceDomainId'].split('.')[1] + id = to_id(edge['sourceDomainId']) protein = proteins[id] - protein.entrez = edge['targetDomainId'].split('.')[1] + protein.entrez = to_id(edge['targetDomainId']) gene_to_prots[protein.entrez].add(id) def add_genes(node): - id = node['primaryDomainId'].split('.')[1] + id = to_id(node['primaryDomainId']) for prot_id in gene_to_prots[id]: protein = proteins[prot_id] try: @@ -116,65 +99,177 @@ class nedrex_importer: iter_node_collection('protein', add_protein) iter_edge_collection('protein_encoded_by_gene', add_edges) + + with_entrez = dict() + for ids in gene_to_prots.values(): + for id in ids: + with_entrez[id] = proteins[id] + proteins = with_entrez + iter_node_collection('gene', add_genes) # TODO test updating ideas + if update: - (updates, creates) = identify_updates(proteins, self.proteins) - models.Protein.objects.bulk_update(updates) + (updates, creates) = identify_updates(proteins, self.cache.proteins) + for u in updates: + u.save() models.Protein.objects.bulk_create(creates) for protein in creates: - self.proteins[protein.uniprot_code] = protein + self.cache.proteins[protein.uniprot_code] = protein else: models.Protein.objects.bulk_create(proteins.values()) - self.proteins = proteins - return len(self.proteins) + self.cache.proteins = proteins + return len(self.cache.proteins) def import_drugs(self, update): drugs = dict() if update: - self.init_drugs() + self.cache.init_drugs() def add_drug(node): - id = node['primaryDomainId'].split('.')[1] + id = to_id(node['primaryDomainId']) drugs[id] = models.Drug(drug_id=id, name=node['displayName'], status=format_list(node['drugGroups'])) iter_node_collection('drug', add_drug) # TODO test updating ideas if update: - (updates, creates) = identify_updates(drugs, self.drugs) - models.Drug.objects.bulk_update(updates) + (updates, creates) = identify_updates(drugs, self.cache.drugs) + for u in updates: + u.save() models.Drug.objects.bulk_create(creates) for drug in creates: - self.drugs[drug.drug_id] = drug + self.cache.drugs[drug.drug_id] = drug else: models.Drug.objects.bulk_create(drugs.values()) - self.drugs = drugs + self.cache.drugs = drugs - return len(self.drugs) + return len(self.cache.drugs) def import_disorders(self, update): disorders = dict() if update: - self.init_disorders() + self.cache.init_disorders() def add_disorder(node): - id = node['primaryDomainId'].split('.')[1] + id = to_id(node['primaryDomainId']) disorders[id] = models.Disorder(mondo_id=id, label=node['displayName'], icd10=format_list(node['icd10'])) iter_node_collection('disorder', add_disorder) # TODO test updating ideas if update: - (updates, creates) = identify_updates(disorders, self.disorders) - models.Disorder.objects.bulk_update(updates) + (updates, creates) = identify_updates(disorders, self.cache.disorders) + for u in updates: + u.save() models.Disorder.objects.bulk_create(creates) for disorder in creates: - self.disorders[disorder.uniprot_code] = disorder + self.cache.disorders[disorder.mondo_id] = disorder else: models.Disorder.objects.bulk_create(disorders.values()) - self.disorders = disorders + self.cache.disorders = disorders + + return len(self.cache.disorders) + + def import_drug_target_interactions(self, dataset, update): + self.cache.init_drugs() + self.cache.init_proteins() + + if update: + models.ProteinDrugInteraction.objects.filter(pdi_dataset=dataset).delete() + + bulk = set() - return len(self.disorders) + def add_dpi(edge): + try: + bulk.add(models.ProteinDrugInteraction(pdi_dataset=dataset, + drug=self.cache.get_drug_by_drugbank( + to_id(edge['sourceDomainId'])), + protein=self.cache.get_protein_by_uniprot( + to_id(edge['targetDomainId'])))) + except KeyError: + pass + iter_edge_collection('drug_has_target', add_dpi) + models.ProteinDrugInteraction.objects.bulk_create(bulk) + return len(bulk) + def import_protein_protein_interactions(self, dataset, update): + self.cache.init_proteins() + + if update: + models.ProteinProteinInteraction.objects.filter(ppi_dataset=dataset).delete() + + bulk = list() + + def iter_ppi(eval): + from python_nedrex import ppi + offset = 0 + limit = 10000 + while True: + result = ppi.ppis({"exp"}, skip=offset, limit=limit) + if not result: + return + for edge in result: + eval(edge) + offset += limit + + def add_ppi(edge): + try: + bulk.append(models.ProteinProteinInteraction(ppi_dataset=dataset, + from_protein=self.cache.get_protein_by_uniprot( + to_id(edge['memberOne'])), + to_protein=self.cache.get_protein_by_uniprot( + to_id(edge['memberTwo'])))) + except KeyError: + pass + + iter_ppi(add_ppi) + models.ProteinProteinInteraction.objects.bulk_create(bulk) + return len(bulk) + + def import_protein_disorder_associations(self, dataset, update): + self.cache.init_disorders() + self.cache.init_proteins() + + if update: + models.ProteinDisorderAssociation.objects.filter(pdis_dataset=dataset).delete() + + bulk = set() + + def add_pdis(edge): + try: + disorder = self.cache.get_disorder_by_mondo(to_id(edge['targetDomainId'])) + for protein in self.cache.get_proteins_by_entrez(to_id(edge['sourceDomainId'])): + bulk.add(models.ProteinDisorderAssociation(pdis_dataset=dataset, + protein=protein, + disorder=disorder, score=edge['score'])) + except KeyError: + pass + + iter_edge_collection('gene_associated_with_disorder', add_pdis) + models.ProteinDisorderAssociation.objects.bulk_create(bulk) + return len(bulk) + + def import_drug_disorder_indications(self, dataset, update): + self.cache.init_disorders() + self.cache.init_drugs() + + if update: + models.DrugDisorderIndication.objects.filter(drdi_dataset=dataset).delete() + + bulk = set() + + def add_drdis(edge): + try: + bulk.add(models.DrugDisorderIndication(drdi_dataset=dataset, + drug=self.cache.get_drug_by_drugbank( + to_id(edge['sourceDomainId'])), + disorder=self.cache.get_disorder_by_mondo( + to_id(edge['targetDomainId'])))) + except KeyError: + pass + + iter_edge_collection('drug_has_indication', add_drdis) + models.DrugDisorderIndication.objects.bulk_create(bulk) + return len(bulk) diff --git a/drugstone/management/commands/make_graphs.py b/drugstone/management/commands/make_graphs.py index 168c8370bd8c3c3dbdf8c32cd70cbabf91a131d2..a6c3d815fe20d0a355a93f133aea4bd0fd29ec6c 100755 --- a/drugstone/management/commands/make_graphs.py +++ b/drugstone/management/commands/make_graphs.py @@ -80,7 +80,7 @@ def _internal_ppis(dataset_name: str) -> List[dict]: def create_gt(params: Tuple[str, str]) -> None: """Fetches all required information to build a graph-tools file for given PPI and PDI dataset names (params). Builds the graph-tools file and saves it in - the data_drugstone/Networks folder. + the data/Networks folder. Args: params (Tuple[str, str]): Protein-protein-dataset name, Protein-drug-dataset name @@ -176,7 +176,7 @@ def create_gt(params: Tuple[str, str]) -> None: print("done with drug edges") # save graph - filename = f"./data_drugstone/Networks/internal_{ppi_dataset}_{pdi_dataset}.gt" + filename = f"./data/Networks/internal_{ppi_dataset}_{pdi_dataset}.gt" g.save(filename) print(f"Created file {filename}") return diff --git a/drugstone/management/commands/populate_db.py b/drugstone/management/commands/populate_db.py index 00f3d7ca50eb22ad3ee40ceba11238853fed5edf..66d878f9d97188eb4c2275f0873451d27f56e2d1 100755 --- a/drugstone/management/commands/populate_db.py +++ b/drugstone/management/commands/populate_db.py @@ -1,28 +1,20 @@ from django.core.management.base import BaseCommand -import pandas as pd -from django.db import OperationalError, IntegrityError +from django.db import OperationalError -from drugstone.models import Protein, Drug, Tissue, ExpressionLevel, PPIDataset, PDIDataset, Disorder, PDisDataset, DrDiDataset -from drugstone.models import ProteinProteinInteraction, ProteinDrugInteraction +from drugstone.models import Protein, Drug, Tissue, ExpressionLevel, PPIDataset, PDIDataset, Disorder, PDisDataset, \ + DrDiDataset, EnsemblGene +from drugstone.models import ProteinProteinInteraction, ProteinDrugInteraction, ProteinDisorderAssociation, \ + DrugDisorderIndication from drugstone.management.includes.DataPopulator import DataPopulator -from .import_from_nedrex import nedrex_importer +from .import_from_nedrex import NedrexImporter +from drugstone.management.includes.NodeCache import NodeCache +from drugstone.management.includes import DatasetLoader class DatabasePopulator: - def __init__(self, data_dir, - # protein_file, - # drug_file, - # protein_protein_interaction_file, - # protein_drug_interaction_file, - tissue_expression_file, - ): + def __init__(self, data_dir): self.data_dir = data_dir - # self.protein_file = protein_file - # self.drug_file = drug_file - # self.ppi_file = protein_protein_interaction_file - # self.pdi_file = protein_drug_interaction_file - self.exp_file = tissue_expression_file def delete_model(self, model): from django.db import connection @@ -32,31 +24,37 @@ class DatabasePopulator: except OperationalError: cursor.execute('DELETE FROM "{0}"'.format(model._meta.db_table)) + def delete_all(self): + models = ['PPI', 'PDI', 'DrDi', 'Protein', 'Drug', 'Disorder', 'PDi', 'Expression', 'Tissue'] + self.delete_models(models) + def delete_models(self, model_list): for model_name in model_list: print(f'Deleting {model_name} model ...') if model_name == 'PPI': + self.delete_model(PPIDataset) self.delete_model(ProteinProteinInteraction) elif model_name == 'PDI': + self.delete_model(PDIDataset) self.delete_model(ProteinDrugInteraction) + elif model_name == 'DrDi': + self.delete_model(DrDiDataset) + self.delete_model(DrugDisorderIndication) elif model_name == 'Protein': self.delete_model(Protein) + self.delete_model(EnsemblGene) elif model_name == 'Drug': self.delete_model(Drug) elif model_name == 'Disorder': self.delete_model(Disorder) - elif model_name == 'PDiAssociations': + elif model_name == 'PDi': self.delete_model(PDisDataset) + self.delete_model(ProteinDisorderAssociation) + elif model_name == 'Expression': + self.delete_model(ExpressionLevel) elif model_name == 'Tissue': self.delete_model(Tissue) - elif model_name == 'PPIDataset': - self.delete_model(PPIDataset) - elif model_name == 'PDIDataset': - self.delete_model(PDIDataset) - elif model_name == 'DrDiDataset': - self.delete_model(DrDiDataset) - class Command(BaseCommand): @@ -65,104 +63,126 @@ class Command(BaseCommand): # dataset directory parser.add_argument('-dd', '--data_dir', type=str, help='Dataset directory path') parser.add_argument('-dm', '--delete_model', type=str, help='Delete model(s)') + parser.add_argument('-c', '--clear', action='store_true', help='Delete all models') + parser.add_argument('-a', '--all', action='store_true', help='Populate all tables') + parser.add_argument('-u', '--update', action='store_true', help='Execute database update for selected tables') parser.add_argument('-p', '--proteins', action='store_true', help='Populate Proteins') parser.add_argument('-di', '--disorders', action='store_true', help='Populate Disorders') parser.add_argument('-dr', '--drugs', action='store_true', help='Drug file name') - parser.add_argument('-exp', '--exp_file', type=str, help='Tissue expression file (.gct without first 2 lines)') + parser.add_argument('-exp', '--exp', action='store_true', + help='Tissue expression file (.gct without first 2 lines)') - parser.add_argument('-pp', '--protein_protein', type=str, help='Populate Protein-Protein Interactions') - parser.add_argument('-pdr', '--protein_drug', type=str, help='Populate Protein-Drug Interactions') - parser.add_argument('-pdi', '--protein_disorder', type=str, help='Populate Protein-Disorder Associations') - parser.add_argument('-ddi', '--drug_disorder', type=str, help='Populate Drug-Disorder Indications') + parser.add_argument('-pp', '--protein_protein', action='store_true', + help='Populate Protein-Protein Interactions') + parser.add_argument('-pdr', '--protein_drug', action='store_true', help='Populate Protein-Drug Interactions') + parser.add_argument('-pdi', '--protein_disorder', action='store_true', + help='Populate Protein-Disorder Associations') + parser.add_argument('-ddi', '--drug_disorder', action='store_true', help='Populate Drug-Disorder Indications') def handle(self, *args, **kwargs): - + nedrex_api_url = "http://82.148.225.92:8123/" data_dir = kwargs['data_dir'] - exp_file = kwargs['exp_file'] - - # p = kwargs['proteins'] - # pp = kwargs['protein_protein'] - # pd = kwargs['protein_drug'] + db_populator = DatabasePopulator(data_dir=data_dir) - db_populator = DatabasePopulator(data_dir=data_dir, - # protein_file=protein_file, - # drug_file=drug_file, - # protein_protein_interaction_file=ppi_file, - # protein_drug_interaction_file=pdi_file, - tissue_expression_file=exp_file, - ) - - importer = nedrex_importer("http://82.148.225.92:8123/") + if kwargs['clear']: + db_populator.delete_all() if kwargs['delete_model'] is not None: model_list = kwargs['delete_model'].split(',') db_populator.delete_models(model_list) - return - populator = DataPopulator() + cache = NodeCache() + update = True if kwargs['update'] else False + importer = NedrexImporter(nedrex_api_url, cache) + populator = DataPopulator(cache) + + if kwargs['all']: + kwargs['drugs'] = True + kwargs['disorders'] = True + kwargs['proteins'] = True + kwargs['exp'] = True + kwargs['protein_protein'] = True + kwargs['protein_drug'] = True + kwargs['protein_disorder'] = True + kwargs['drug_disorder'] = True if kwargs['drugs']: print('Populating Drugs...') - # n = DataPopulator.populate_drugs(populator) - n = nedrex_importer.import_drugs(importer,False) + n = NedrexImporter.import_drugs(importer, update) print(f'Populated {n} Drugs.') - - if kwargs['exp_file'] is not None: - print('Populating Expressions...') - n = DataPopulator.populate_expessions(populator) - print(f'Populated {n} Expressions.') + if kwargs['disorders']: + print('Populating Disorders...') + n = NedrexImporter.import_disorders(importer, update) + print(f'Populated {n} Disorders.') if kwargs['proteins']: print('Populating Proteins...') - - n = nedrex_importer.import_proteins(importer, False) - # n = DataPopulator.populate_proteins(populator) + n = NedrexImporter.import_proteins(importer, update) print(f'Populated {n} Proteins.') - - # print('Populating ENSG IDs...') - # n = DataPopulator.populate_ensg(populator) - # print(f'Populated {n} ENSG IDs.') + print('Populating ENSG IDs...') + n = DataPopulator.populate_ensg(populator,update) + print(f'Populated {n} ENSG IDs.') - if kwargs['disorders']: - print('Populating Disorders...') - n = nedrex_importer.import_disorders(importer, False) - # n = DataPopulator.populate_disorders(populator) - print(f'Populated {n} Disorders.') + if kwargs['exp']: + print('Populating Expressions...') + n = DataPopulator.populate_expressions(populator, update) + print(f'Populated {n} Expressions.') - if kwargs['protein_protein'] is not None: + if kwargs['protein_protein']: + print('Importing PPIs from NeDRexDB...') + n = NedrexImporter.import_protein_protein_interactions(importer, + DatasetLoader.get_ppi_nedrex(nedrex_api_url), + update) + print(f'Imported {n} PPIs from NeDRexDB') print('Populating PPIs from STRING...') - n = DataPopulator.populate_ppi_string(populator) + n = DataPopulator.populate_ppi_string(populator, DatasetLoader.get_ppi_string(), update) print(f'Populated {n} PPIs from STRING.') print('Populating PPIs from APID...') - n = DataPopulator.populate_ppi_apid(populator) + n = DataPopulator.populate_ppi_apid(populator, DatasetLoader.get_ppi_apid(), update) print(f'Populated {n} PPIs from APID.') print('Populating PPIs from BioGRID...') - n = DataPopulator.populate_ppi_biogrid(populator) + n = DataPopulator.populate_ppi_biogrid(populator, DatasetLoader.get_ppi_biogrid(), update) print(f'Populated {n} PPIs from BioGRID.') - if kwargs['protein_drug'] is not None: + if kwargs['protein_drug']: + print('Importing PDIs from NeDRexDB...') + n = NedrexImporter.import_drug_target_interactions(importer, DatasetLoader.get_drug_target_nedrex(nedrex_api_url), update) + print(f'Imported {n} PDIs from NeDRexDB') + print('Populating PDIs from Chembl...') - n = DataPopulator.populate_pdi_chembl(populator) + n = DataPopulator.populate_pdi_chembl(populator,DatasetLoader.get_drug_target_chembl(), update) print(f'Populated {n} PDIs from Chembl.') print('Populating PDIs from DGIdb...') - n = DataPopulator.populate_pdi_dgidb(populator) + n = DataPopulator.populate_pdi_dgidb(populator, DatasetLoader.get_drug_target_dgidb(), update) print(f'Populated {n} PDIs from DGIdb.') print('Populating PDIs from DrugBank...') - n = DataPopulator.populate_pdi_drugbank(populator) + n = DataPopulator.populate_pdi_drugbank(populator, DatasetLoader.get_drug_target_drugbank(), update) print(f'Populated {n} PDIs from DrugBank.') - if kwargs['protein_disorder'] is not None: + + if kwargs['protein_disorder']: + print('Importing PDis from NeDRexDB...') + n = NedrexImporter.import_protein_disorder_associations(importer, + DatasetLoader.get_protein_disorder_nedrex(nedrex_api_url), + update) + print(f'Imported {n} PDis from NeDRexDB') print('Populating PDis associations from DisGeNET...') - n=DataPopulator.populate_pdis_disgenet(populator) + n = DataPopulator.populate_pdis_disgenet(populator, DatasetLoader.get_disorder_protein_disgenet(), update) print(f'Populated {n} PDis associations from DisGeNET.') - if kwargs['drug_disorder'] is not None: + + if kwargs['drug_disorder']: + print('Importing DrDis from NeDRexDB...') + n = NedrexImporter.import_drug_disorder_indications(importer, + DatasetLoader.get_drug_disorder_nedrex(nedrex_api_url), + update) + print(f'Imported {n} DrDis from NeDRexDB') print('Populating DrDi indications from DrugBank...') - n=DataPopulator.populate_drdis_drugbank(populator) + n = DataPopulator.populate_drdis_drugbank(populator, DatasetLoader.get_drug_disorder_drugbank(), update) print(f'Populated {n} DrDi associations from DrugBank.') diff --git a/drugstone/management/commands/test.py b/drugstone/management/commands/test.py index 5afbea249d2d65f1cf5efad9476faf9c78821a9e..4d01dd45c09e844124c1d244516d543bda8d9c58 100644 --- a/drugstone/management/commands/test.py +++ b/drugstone/management/commands/test.py @@ -1,5 +1,6 @@ import python_nedrex as nedrex from python_nedrex.core import get_nodes, get_edges, get_api_key +from python_nedrex.static import get_metadata def iter_node_collection(coll_name, eval): offset = 0 @@ -25,9 +26,26 @@ def iter_edge_collection(coll_name, eval): offset += limit +def iter_ppi(eval): + from python_nedrex import ppi + offset = 0 + limit = 1000 + while True: + result = ppi.ppis({"exp"},skip = offset, limit=limit) + if not result: + return + for edge in result: + eval(edge) + offset += limit + base_url = "http://82.148.225.92:8123/" nedrex.config.set_url_base(base_url) api_key = get_api_key(accept_eula=True) nedrex.config.set_api_key(api_key) +print(f'Nodes: {nedrex.core.get_node_types()}') +print(f'Edges: {nedrex.core.get_edge_types()}') +print(f'{get_metadata()}') + -iter_edge_collection("gene_expressed_in_tissue", lambda node: {print(node)}) \ No newline at end of file +iter_ppi(lambda node: print(node)) +# iter_edge_collection("gene_expressed_in_tissue", lambda node: {print(node)}) \ No newline at end of file diff --git a/drugstone/management/includes/DataLoader.py b/drugstone/management/includes/DataLoader.py index 8fa0d2a56c0710d228a1581d5c7565d6a6e33847..ca31b567951f8493117ae595d60e56a124442f0b 100755 --- a/drugstone/management/includes/DataLoader.py +++ b/drugstone/management/includes/DataLoader.py @@ -3,14 +3,14 @@ import json class DataLoader: - PATH_PROTEINS = 'data_drugstone/Proteins/' - PATH_DRUGS = 'data_drugstone/Drugs/' - PATH_EXPR = 'data_drugstone/' - PATH_DISORDERS = 'data_drugstone/Disorders/' - PATH_PDI = 'data_drugstone/PDI/' - PATH_PPI = 'data_drugstone/PPI/' - PATH_PDi = 'data_drugstone/PDi/' - PATH_DDi = 'data_drugstone/DrDi/' + PATH_PROTEINS = 'data/Proteins/' + PATH_DRUGS = 'data/Drugs/' + PATH_EXPR = 'data/Expression/' + PATH_DISORDERS = 'data/Disorders/' + PATH_PDI = 'data/PDI/' + PATH_PPI = 'data/PPI/' + PATH_PDi = 'data/PDi/' + PATH_DDi = 'data/DrDi/' # Proteins PROTEINS_COVEX = 'protein_list.csv' @@ -230,7 +230,7 @@ class DataLoader: Returns: pd.DataFrame: columns "protein_name", "disorder_name" and "score" """ - return pd.read_csv(f'{DataLoader.PATH_PDi}{DataLoader.PDi_DISGENET}', sep='\t') + return pd.read_csv(f'{DataLoader.PATH_PDi}{DataLoader.PDi_DISGENET}', sep='\t', dtype={'disorder_name':str, 'protein_name':str, 'score':float}) @staticmethod def load_drdis_drugbank() -> pd.DataFrame: @@ -239,7 +239,7 @@ class DataLoader: Returns: pd.DataFrame: columns "drugbank_id" and "mondo_id" """ - return pd.read_csv(f'{DataLoader.PATH_DDi}{DataLoader.DDi_DRUGBANK}', sep='\t') + return pd.read_csv(f'{DataLoader.PATH_DDi}{DataLoader.DDi_DRUGBANK}', sep='\t', dtype={'drugbank_id':str, 'mondo_id':str}) @staticmethod def load_pdi_dgidb() -> pd.DataFrame: diff --git a/drugstone/management/includes/DataPopulator.py b/drugstone/management/includes/DataPopulator.py index 40562323890de411e4cd27904d631bfc013f5165..5858202f82fd095b0c879e423a22a0f973015c5f 100755 --- a/drugstone/management/includes/DataPopulator.py +++ b/drugstone/management/includes/DataPopulator.py @@ -1,88 +1,18 @@ -from collections import defaultdict - from drugstone.management.includes.DataLoader import DataLoader import drugstone.models as models +from drugstone.management.includes.NodeCache import NodeCache class DataPopulator: - proteins = dict() - uniprot_to_ensembl = dict() - gene_name_to_ensembl = defaultdict(lambda: set()) - disorders = dict() - drugs = dict() - - def init_proteins(self): - if len(self.proteins) == 0: - print("Generating protein maps...") - for protein in models.Protein.objects.all(): - self.proteins[protein.entrez]=protein - self.uniprot_to_ensembl[protein.uniprot_code] = protein.entrez - self.gene_name_to_ensembl[protein.gene].add(protein.entrez) - - def init_drugs(self): - if len(self.drugs)== 0: - print("Generating drug map...") - for drug in models.Drug.objects.all(): - self.drugs[drug.drug_id]=drug - - def init_disorders(self): - if len(self.disorders) == 0: - print("Generating disorder map...") - for disorder in models.Disorder.objects.all(): - self.disorders[disorder.mondo_id]=disorder - - # def populate_proteins(self) -> int: - # """ Populates the Protein table in the django database. - # Handles loading the data and passing it to the django database - # - # Returns: - # int: Count of how many proteins were added - # """ - # df = DataLoader.load_proteins() - # for _, row in df.iterrows(): - # self.proteins[row['entrez_id']] = models.Protein( - # uniprot_code=row['protein_ac'], - # gene=row['gene_name'], - # entrez=row['entrez_id'], - # protein_name=row['protein_name']) - # self.uniprot_to_ensembl[row['protein_ac']] = row['entrez_id'] - # self.gene_name_to_ensembl[row['gene_name']].add(row['entrez_id']) - # - # models.Protein.objects.bulk_create(self.proteins.values()) - # return len(self.proteins) - # - # def populate_disorders(self) -> int: - # """ Populates the Disorder table in the django database. - # Handles loading the data and passing it to the django database - # - # Returns: - # int: Count of how many disorders were added - # """ - # df = DataLoader.load_disorders() - # for _, row in df.iterrows(): - # self.disorders[row['mondo_id']] = models.Disorder( - # mondo_id=row['mondo_id'], - # label=row['label'], - # icd10=row['icd10'] - # ) - # models.Disorder.objects.bulk_create(self.disorders.values()) - # return len(self.disorders) - # - # def populate_drugs(self): - # df = DataLoader.load_drugs() - # for _, row in df.iterrows(): - # drug_id = row['drug_id'] - # drug_name = row['drug_name'] - # drug_status = row['drug_status'] - # self.drugs[drug_id] = models.Drug( - # drug_id=drug_id, - # name=drug_name, - # status=drug_status) - # models.Drug.objects.bulk_create(self.drugs.values()) - # return len(self.drugs) - - def populate_expessions(self): - self.init_proteins() + + def __init__(self, cache: NodeCache): + self.cache = cache + + def populate_expressions(self, update): + if update: + models.ExpressionLevel.objects.all().delete() + + self.cache.init_proteins() df = DataLoader.load_expressions() tissues_models = dict() @@ -94,28 +24,34 @@ class DataPopulator: tissues_models[tissue_name] = tissue_model proteins_linked = 0 - unique = set() - bulk = list() + bulk = set() + uniq = set() + size = 0 for _, row in df.iterrows(): gene_name = row['Description'] - for protein_id in self.gene_name_to_ensembl[gene_name]: - protein_model = self.proteins[protein_id] + for protein_model in self.cache.get_proteins_by_gene(gene_name): proteins_linked += 1 for tissue_name, tissue_model in tissues_models.items(): - id = f"{tissue_name}_{protein_id}" - if id in unique: + expr = models.ExpressionLevel(protein=protein_model, + tissue=tissue_model, + expression_level=row[tissue_name]) + id = hash(expr) + if id in uniq: continue - unique.add(id) - bulk.append(models.ExpressionLevel(protein=protein_model, - tissue=tissue_model, - expression_level=row[tissue_name])) + uniq.add(id) + bulk.add(expr) + if len(bulk) > 100000: + models.ExpressionLevel.objects.bulk_create(bulk) + size += len(bulk) + bulk = set() + models.ExpressionLevel.objects.bulk_create(bulk) - return len(bulk) + return size + len(bulk) - def populate_ensg(self) -> int: + def populate_ensg(self,update) -> int: """ Populates the Ensembl-Gene table in the django database. Also maps the added ensg entries to the corresponding proteins. Handles loading the data and passing it to the django database @@ -123,76 +59,78 @@ class DataPopulator: Returns: int: Count of how many ensg-protein relations were added """ - self.init_proteins() + if update: + models.EnsemblGene.objects.all().delete() + self.cache.init_proteins() data = DataLoader.load_ensg() bulk = list() + for entrez, ensg_list in data.items(): - protein = self.proteins[entrez] - for ensg in ensg_list: - bulk.append(models.EnsemblGene(name=ensg, protein=protein)) + proteins = self.cache.get_proteins_by_entrez(entrez) + for protein in proteins: + for ensg in ensg_list: + bulk.append(models.EnsemblGene(name=ensg, protein=protein)) models.EnsemblGene.objects.bulk_create(bulk) return len(bulk) - def populate_ppi_string(self) -> int: + def populate_ppi_string(self, dataset, update) -> int: """ Populates the Protein-Protein-Interactions from STRINGdb Handles loading the data and passing it to the django database Returns: int: Count of how many interactions were added """ - self.init_proteins() + self.cache.init_proteins() + if update: + models.ProteinProteinInteraction.objects.filter(ppi_dataset=dataset).delete() + df = DataLoader.load_ppi_string() - dataset, _ = models.PPIDataset.objects.get_or_create( - name='STRING', - link='https://string-db.org/', - version='11.0' - ) bulk = list() for _, row in df.iterrows(): try: # try fetching proteins - protein_a = self.proteins[row['entrez_a']] - protein_b = self.proteins[row['entrez_b']] + proteins_a = self.cache.get_proteins_by_entrez(row['entrez_a']) + proteins_b = self.cache.get_proteins_by_entrez(row['entrez_b']) except KeyError: # continue if not found continue - try: - bulk.append(models.ProteinProteinInteraction( - ppi_dataset=dataset, - from_protein=protein_a, - to_protein=protein_b - )) - except models.ValidationError: - # duplicate - continue + for protein_a in proteins_a: + for protein_b in proteins_b: + try: + bulk.append(models.ProteinProteinInteraction( + ppi_dataset=dataset, + from_protein=protein_a, + to_protein=protein_b + )) + except models.ValidationError: + # duplicate + continue models.ProteinProteinInteraction.objects.bulk_create(bulk) return len(bulk) - def populate_ppi_apid(self) -> int: + def populate_ppi_apid(self, dataset, update) -> int: """ Populates the Protein-Protein-Interactions from Apid Handles loading the data and passing it to the django database Returns: int: Count of how many interactions were added """ - self.init_proteins() + self.cache.init_proteins() + + if update: + models.ProteinProteinInteraction.objects.filter(ppi_dataset=dataset).delete() df = DataLoader.load_ppi_apid() - dataset, _ = models.PPIDataset.objects.get_or_create( - name='APID', - link='http://cicblade.dep.usal.es:8080/APID/', - version='January 2019' - ) - bulk = list() + bulk = set() for _, row in df.iterrows(): try: # try fetching proteins - protein_a = self.proteins[self.uniprot_to_ensembl[row['from_protein_ac']]] - protein_b = self.proteins[self.uniprot_to_ensembl[row['to_protein_ac']]] + protein_a = self.cache.get_protein_by_uniprot(row['from_protein_ac']) + protein_b = self.cache.get_protein_by_uniprot(row['to_protein_ac']) except KeyError: # continue if not found continue try: - bulk.append(models.ProteinProteinInteraction( + bulk.add(models.ProteinProteinInteraction( ppi_dataset=dataset, from_protein=protein_a, to_protein=protein_b @@ -202,71 +140,69 @@ class DataPopulator: models.ProteinProteinInteraction.objects.bulk_create(bulk) return len(bulk) - def populate_ppi_biogrid(self) -> int: + def populate_ppi_biogrid(self,dataset, update) -> int: """ Populates the Protein-Protein-Interactions from BioGRID Handles loading the data and passing it to the django database Returns: int: Count of how many interactions were added """ - self.init_proteins() + self.cache.init_proteins() + + if update: + models.ProteinProteinInteraction.objects.filter(ppi_dataset=dataset).delete() df = DataLoader.load_ppi_biogrid() - dataset, _ = models.PPIDataset.objects.get_or_create( - name='BioGRID', - link='https://thebiogrid.org/', - version='4.0' - ) bulk = list() for _, row in df.iterrows(): try: # try fetching proteins - protein_a = self.proteins[row['entrez_a']] - protein_b = self.proteins[row['entrez_b']] + proteins_a = self.cache.get_proteins_by_entrez(row['entrez_a']) + proteins_b = self.cache.get_proteins_by_entrez(row['entrez_b']) except KeyError: # TODO update error # continue if not found continue - try: - bulk.append(models.ProteinProteinInteraction( - ppi_dataset=dataset, - from_protein=protein_a, - to_protein=protein_b - )) - except models.ValidationError: - # duplicate - continue + for protein_a in proteins_a: + for protein_b in proteins_b: + try: + bulk.append(models.ProteinProteinInteraction( + ppi_dataset=dataset, + from_protein=protein_a, + to_protein=protein_b + )) + except models.ValidationError: + # duplicate + continue models.ProteinProteinInteraction.objects.bulk_create(bulk) return len(bulk) - def populate_pdi_chembl(self) -> int: + def populate_pdi_chembl(self,dataset, update) -> int: """ Populates the Protein-Drug-Interactions from Chembl Handles Loading the data and passing it to the django database Returns: int: Count of how many interactions were added """ - self.init_proteins() - self.init_drugs() + self.cache.init_proteins() + self.cache.init_drugs() + + if update: + models.ProteinDrugInteraction.objects.filter(pdi_dataset=dataset).delete() df = DataLoader.load_pdi_chembl() - dataset, _ = models.PDIDataset.objects.get_or_create( - name='ChEMBL', - link='https://www.ebi.ac.uk/chembl/', - version='27', - ) - bulk = list() + bulk = set() for _, row in df.iterrows(): try: - protein = self.proteins[self.uniprot_to_ensembl[row['protein_ac']]] + protein = self.cache.get_protein_by_uniprot(row['protein_ac']) except KeyError: # continue if not found continue try: # try fetching drug - drug = self.drugs[row['drug_id']] + drug = self.cache.get_drug_by_drugbank(row['drug_id']) except KeyError: # continue if not found continue - bulk.append(models.ProteinDrugInteraction( + bulk.add(models.ProteinDrugInteraction( pdi_dataset=dataset, protein=protein, drug=drug @@ -274,36 +210,35 @@ class DataPopulator: models.ProteinDrugInteraction.objects.bulk_create(bulk) return len(bulk) - def populate_pdis_disgenet(self,) -> int: + def populate_pdis_disgenet(self, dataset, update) -> int: """ Populates the Protein-Disorder-Interactions from DisGeNET Handles Loading the data and passing it to the django database Returns: int: Count of how many interactions were added """ - self.init_proteins() - self.init_disorders() + self.cache.init_proteins() + self.cache.init_disorders() + + + if update: + models.ProteinDisorderAssociation.objects.filter(pdis_dataset=dataset).delete() df = DataLoader.load_pdis_disgenet() - dataset, _ = models.PDisDataset.objects.get_or_create( - name='DisGeNET', - link='https://www.disgenet.org/home/', - version='6.0', - ) - bulk = list() + bulk = set() for _, row in df.iterrows(): try: # try fetching protein - protein = self.proteins[self.uniprot_to_ensembl[row['protein_name']]] + protein = self.cache.get_protein_by_uniprot(row['protein_name']) except KeyError: # continue if not found continue try: - # try fetching drug - disorder = self.disorders[str(int(row['disorder_name']))] + # try fetching disorder + disorder = self.cache.get_disorder_by_mondo(row['disorder_name']) except KeyError: # continue if not found continue - bulk.append(models.ProteinDisorderAssociation( + bulk.add(models.ProteinDisorderAssociation( pdis_dataset=dataset, protein=protein, disorder=disorder, @@ -312,36 +247,34 @@ class DataPopulator: models.ProteinDisorderAssociation.objects.bulk_create(bulk) return len(bulk) - def populate_drdis_drugbank(self) -> int: + def populate_drdis_drugbank(self, dataset, update) -> int: """ Populates the Drug-Disorder-Indications from DrugBank Handles Loading the data and passing it to the django database Returns: int: Count of how many edges were added """ - self.init_drugs() - self.init_disorders() + self.cache.init_drugs() + self.cache.init_disorders() + if update: + models.DrugDisorderIndication.objects.filter(drdi_dataset=dataset).delete() + df = DataLoader.load_drdis_drugbank() - dataset, _ = models.DrDiDataset.objects.get_or_create( - name='DrugBank', - link='https://go.drugbank.com/', - version='5.1.8', - ) - bulk = list() + bulk = set() for _, row in df.iterrows(): try: # try fetching protein - drug = self.drugs[row['drugbank_id']] + drug = self.cache.get_drug_by_drugbank(row['drugbank_id']) except KeyError: # continue if not found continue try: # try fetching drug - disorder = self.disorders[str(int(row['mondo_id']))] + disorder = self.cache.get_disorder_by_mondo(row['mondo_id']) except KeyError: # continue if not found continue - bulk.append(models.DrugDisorderIndication( + bulk.add(models.DrugDisorderIndication( drdi_dataset=dataset, drug=drug, disorder=disorder, @@ -349,76 +282,75 @@ class DataPopulator: models.DrugDisorderIndication.objects.bulk_create(bulk) return len(bulk) - def populate_pdi_dgidb(self) -> int: + def populate_pdi_dgidb(self,dataset, update) -> int: """ Populates the Protein-Drug-Interactions from DGIdb Handles Loading the data and passing it to the django database Returns: int: Count of how many interactions were added """ - self.init_proteins() - self.init_drugs() + self.cache.init_proteins() + self.cache.init_drugs() + + if update: + models.ProteinDrugInteraction.objects.filter(pdi_dataset=dataset).delete() df = DataLoader.load_pdi_dgidb() - dataset, _ = models.PDIDataset.objects.get_or_create( - name='DGIdb', - link='https://www.dgidb.org/', - version='4.2.0' - ) - bulk = list() + bulk = set() for _, row in df.iterrows(): try: # try fetching protein - protein = self.proteins[row['entrez_id']] + proteins = self.cache.get_proteins_by_entrez(row['entrez_id']) except KeyError: # continue if not found continue try: # try fetching drug - drug = self.drugs[row['drug_id']] + drug = self.cache.get_drug_by_drugbank(row['drug_id']) except KeyError: # continue if not found continue - bulk.append(models.ProteinDrugInteraction( - pdi_dataset=dataset, - protein=protein, - drug=drug - )) + for protein in proteins: + bulk.add(models.ProteinDrugInteraction( + pdi_dataset=dataset, + protein=protein, + drug=drug + )) models.ProteinDrugInteraction.objects.bulk_create(bulk) return len(bulk) - def populate_pdi_drugbank(self) -> int: + def populate_pdi_drugbank(self,dataset, update) -> int: """ Populates the Protein-Drug-Interactions from Drugbank Handles Loading the data and passing it to the django database Returns: int: Count of how many interactions were added """ - self.init_proteins() - self.init_drugs() + self.cache.init_proteins() + self.cache.init_drugs() + + + if update: + models.ProteinDrugInteraction.objects.filter(pdi_dataset=dataset).delete() df = DataLoader.load_pdi_drugbank() - dataset, _ = models.PDIDataset.objects.get_or_create( - name='DrugBank', - link='https://go.drugbank.com/', - version='5.1.7' - ) - bulk = list() + bulk = set() for _, row in df.iterrows(): try: # try fetching protein - protein = self.proteins[row['entrez_id']] + proteins = self.cache.get_proteins_by_entrez(row['entrez_id']) except KeyError: # continue if not found continue try: # try fetching drug - drug = self.drugs[row['drug_id']] + drug = self.cache.get_drug_by_drugbank(row['drug_id']) except KeyError: # continue if not found continue - bulk.append(models.ProteinDrugInteraction( - pdi_dataset=dataset, - protein=protein, - drug=drug - )) + for protein in proteins: + bulk.add(models.ProteinDrugInteraction( + pdi_dataset=dataset, + protein=protein, + drug=drug + )) models.ProteinDrugInteraction.objects.bulk_create(bulk) return len(bulk) diff --git a/drugstone/management/includes/DatasetLoader.py b/drugstone/management/includes/DatasetLoader.py new file mode 100644 index 0000000000000000000000000000000000000000..f608ed1d693adf26c07fba73358408429e30675c --- /dev/null +++ b/drugstone/management/includes/DatasetLoader.py @@ -0,0 +1,99 @@ +from drugstone import models +from python_nedrex.static import get_metadata + +def get_ppi_string(): + dataset, _ = models.PPIDataset.objects.get_or_create( + name='STRING', + link='https://string-db.org/', + version='11.0' + ) + return dataset + +def get_ppi_apid(): + dataset, _ = models.PPIDataset.objects.get_or_create( + name='APID', + link='http://cicblade.dep.usal.es:8080/APID/', + version='January 2019' + ) + return dataset + +def get_ppi_biogrid(): + dataset, _ = models.PPIDataset.objects.get_or_create( + name='BioGRID', + link='https://thebiogrid.org/', + version='4.0' + ) + return dataset + +def get_drug_target_nedrex(url): + dataset, _ = models.PDIDataset.objects.get_or_create( + name='NeDRex', + link=url, + version=get_metadata()['version'], + ) + return dataset + +def get_ppi_nedrex(url): + dataset, _ = models.PPIDataset.objects.get_or_create( + name='NeDRex', + link=url, + version=get_metadata()['version'], + ) + return dataset + +def get_protein_disorder_nedrex(url): + dataset, _ = models.PDisDataset.objects.get_or_create( + name='NeDRex', + link=url, + version=get_metadata()['version'], + ) + return dataset + +def get_drug_disorder_nedrex(url): + dataset, _ = models.DrDiDataset.objects.get_or_create( + name='NeDRex', + link=url, + version=get_metadata()['version'], + ) + return dataset + +def get_drug_target_chembl(): + dataset, _ = models.PDIDataset.objects.get_or_create( + name='ChEMBL', + link='https://www.ebi.ac.uk/chembl/', + version='27', + ) + return dataset + +def get_drug_target_dgidb(): + dataset, _ = models.PDIDataset.objects.get_or_create( + name='DGIdb', + link='https://www.dgidb.org/', + version='4.2.0' + ) + return dataset + +def get_drug_target_drugbank(): + dataset, _ = models.PDIDataset.objects.get_or_create( + name='DrugBank', + link='https://go.drugbank.com/', + version='5.1.7' + ) + return dataset + +def get_disorder_protein_disgenet(): + dataset, _ = models.PDisDataset.objects.get_or_create( + name='DisGeNET', + link='https://www.disgenet.org/home/', + version='6.0', + ) + return dataset + + +def get_drug_disorder_drugbank(): + dataset, _ = models.DrDiDataset.objects.get_or_create( + name='DrugBank', + link='https://go.drugbank.com/', + version='5.1.8', + ) + return dataset diff --git a/drugstone/management/includes/NodeCache.py b/drugstone/management/includes/NodeCache.py new file mode 100644 index 0000000000000000000000000000000000000000..7f9491c0e52d16aede95abbf2b1824f2cdcd02aa --- /dev/null +++ b/drugstone/management/includes/NodeCache.py @@ -0,0 +1,60 @@ +from collections import defaultdict +import drugstone.models as models + + +class NodeCache: + + proteins = dict() + entrez_to_uniprot = defaultdict(lambda: set()) + gene_name_to_uniprot = defaultdict(lambda: set()) + disorders = dict() + drugs = dict() + + def init_protein_maps(self): + print("Generating protein id maps...") + for protein in self.proteins.values(): + self.entrez_to_uniprot[protein.entrez].add(protein.uniprot_code) + self.gene_name_to_uniprot[protein.gene].add(protein.uniprot_code) + + def init_proteins(self): + if len(self.proteins) == 0: + print("Generating protein maps...") + for protein in models.Protein.objects.all(): + self.proteins[protein.uniprot_code] = protein + if len(self.proteins) > 0 and (len(self.entrez_to_uniprot) == 0 or len(self.gene_name_to_uniprot) == 0): + self.init_protein_maps() + + + def init_drugs(self): + if len(self.drugs) == 0: + print("Generating drug map...") + for drug in models.Drug.objects.all(): + self.drugs[drug.drug_id] = drug + + def init_disorders(self): + if len(self.disorders) == 0: + print("Generating disorder map...") + for disorder in models.Disorder.objects.all(): + self.disorders[disorder.mondo_id] = disorder + + + def get_protein_by_uniprot(self,uniprot_id): + return self.proteins[uniprot_id] + + def get_proteins_by_entrez(self,entrez_id): + out = list() + for g in self.entrez_to_uniprot[entrez_id]: + out.append(self.proteins[g]) + return out + + def get_proteins_by_gene(self, gene_name): + out = list() + for g in self.gene_name_to_uniprot[gene_name]: + out.append(self.proteins[g]) + return out + + def get_drug_by_drugbank(self, drugbank_id): + return self.drugs[drugbank_id] + + def get_disorder_by_mondo(self, mondo_id): + return self.disorders[mondo_id] \ No newline at end of file diff --git a/drugstone/models.py b/drugstone/models.py index 92991226b0873e9f762450e54c8b3818ecb16500..4f160dbe5101281d53520f54a353835cf9c10570 100755 --- a/drugstone/models.py +++ b/drugstone/models.py @@ -5,13 +5,6 @@ from django.db import models # Main biological and medical entities -class Tissue(models.Model): - name = models.CharField(max_length=128, default='', unique=True) - - def __str__(self): - return self.name - - class PPIDataset(models.Model): name = models.CharField(max_length=128, default='', unique=False) link = models.CharField(max_length=128, default='', unique=False) @@ -60,18 +53,9 @@ class DrDiDataset(models.Model): unique_together = ('name', 'version') -class ExpressionLevel(models.Model): - tissue = models.ForeignKey('Tissue', on_delete=models.CASCADE) - protein = models.ForeignKey('Protein', on_delete=models.CASCADE) - expression_level = models.FloatField() - - class Meta: - unique_together = ('tissue', 'protein') - - -# class EnsemblGene(models.Model): -# name = models.CharField(max_length=15, unique=True) # starts with ENSG... -# protein = models.ForeignKey('Protein', on_delete=models.CASCADE, related_name='ensg') +class EnsemblGene(models.Model): + name = models.CharField(max_length=15) # starts with ENSG... + protein = models.ForeignKey('Protein', on_delete=models.CASCADE, related_name='ensg') class Protein(models.Model): @@ -84,6 +68,7 @@ class Protein(models.Model): entrez = models.CharField(max_length=15, default='') drugs = models.ManyToManyField('Drug', through='ProteinDrugInteraction', related_name='interacting_drugs') + ensembl = models.CharField(max_length=15, default='') tissue_expression = models.ManyToManyField('Tissue', through='ExpressionLevel', related_name='interacting_drugs') @@ -106,6 +91,25 @@ class Protein(models.Model): self.entrez = other.entrez +class ExpressionLevel(models.Model): + tissue = models.ForeignKey('Tissue', on_delete=models.CASCADE) + protein = models.ForeignKey('Protein', on_delete=models.CASCADE) + expression_level = models.FloatField() + + class Meta: + unique_together = ('tissue', 'protein') + + def __hash__(self): + return hash(f'{self.tissue_id}_{self.protein_id}') + + +class Tissue(models.Model): + name = models.CharField(max_length=128, default='', unique=True) + + def __str__(self): + return self.name + + class Disorder(models.Model): mondo_id = models.CharField(max_length=7) label = models.CharField(max_length=256, default='') # symbol @@ -143,7 +147,7 @@ class Drug(models.Model): return self.drug_id def __eq__(self, other): - return self.drug_id == other.uniprot_code and self.name == other.name and self.status == other.status + return self.drug_id == other.drug_id and self.name == other.name and self.status == other.status def __ne__(self, other): return not self.__eq__(other) @@ -168,6 +172,15 @@ class ProteinDisorderAssociation(models.Model): def __str__(self): return f'{self.pdis_dataset}-{self.protein}-{self.disorder}' + def __eq__(self, other): + return self.pdis_dataset_id == other.pdis_dataset_id and self.protein_id == other.protein_id and self.disorder_id == other.disorder_id + + def __ne__(self, other): + return not self.__eq__(other) + + def __hash__(self): + return hash((self.pdis_dataset_id, self.protein_id, self.disorder_id)) + class DrugDisorderIndication(models.Model): drdi_dataset = models.ForeignKey( @@ -181,6 +194,15 @@ class DrugDisorderIndication(models.Model): def __str__(self): return f'{self.drdi_dataset}-{self.drug}-{self.disorder}' + def __eq__(self, other): + return self.drdi_dataset_id == other.drdi_dataset_id and self.drug_id == other.drug_id and self.disorder_id == other.disorder_id + + def __ne__(self, other): + return not self.__eq__(other) + + def __hash__(self): + return hash((self.drdi_dataset_id, self.drug_id, self.disorder_id)) + class ProteinProteinInteraction(models.Model): ppi_dataset = models.ForeignKey( @@ -210,10 +232,19 @@ class ProteinProteinInteraction(models.Model): def __str__(self): return f'{self.ppi_dataset}-{self.from_protein}-{self.to_protein}' + def __eq__(self, other): + return self.ppi_dataset_id == other.ppi_dataset_id and self.from_protein_id == other.from_protein_id and self.to_protein_id == other.to_protein_id + + def __ne__(self, other): + return not self.__eq__(other) + + def __hash__(self): + return hash((self.ppi_dataset_id, self.from_protein_id, self.to_protein_id)) + class ProteinDrugInteraction(models.Model): pdi_dataset = models.ForeignKey( - 'PDIDataset', null=True, on_delete=models.CASCADE, related_name='pdi_dataset_relation') + PDIDataset, null=True, on_delete=models.CASCADE, related_name='pdi_dataset_relation') protein = models.ForeignKey('Protein', on_delete=models.CASCADE) drug = models.ForeignKey('Drug', on_delete=models.CASCADE) @@ -223,6 +254,15 @@ class ProteinDrugInteraction(models.Model): def __str__(self): return f'{self.pdi_dataset}-{self.protein}-{self.drug}' + def __eq__(self, other): + return self.pdi_dataset_id == other.pdi_dataset_id and self.protein_id == other.protein_id and self.drug_id == other.drug_id + + def __ne__(self, other): + return not self.__eq__(other) + + def __hash__(self): + return hash((self.pdi_dataset_id, self.protein_id, self.drug_id)) + class Task(models.Model): token = models.CharField(max_length=32, unique=True) diff --git a/docker-entrypoint.sh b/scripts/docker-entrypoint.sh similarity index 63% rename from docker-entrypoint.sh rename to scripts/docker-entrypoint.sh index 9686a3cf6e51f0d2ee352a4eb06e1d0deb97e0c5..6dc4cd8b33ca3885b1507ec6296168651dcde399 100755 --- a/docker-entrypoint.sh +++ b/scripts/docker-entrypoint.sh @@ -3,13 +3,17 @@ file="docker-entrypoint.lock" # exit if entrypoint.lock exists to prevent new import of data every time docker is restarted -python3 manage.py makemigrations drugstone -python3 manage.py migrate + if ! test -f "$file"; then +# sh scripts/import-data.sh + python3 manage.py makemigrations drugstone + python3 manage.py migrate python3 manage.py createfixtures python3 manage.py cleanuptasks - sh scripts/import-data.sh + python3 manage.py populate_db -u --all + python3 manage.py make_graphs + touch $file fi diff --git a/scripts/import-data.sh b/scripts/import-data.sh index de1c938d052f5bd9c3845f549a67f785391c7254..ea358a3d04ea1699df154c4ad1f09d00d0e482d0 100755 --- a/scripts/import-data.sh +++ b/scripts/import-data.sh @@ -1,11 +1,15 @@ #!/bin/bash -python3 manage.py populate_db --delete_model PPI,PDI,Drug,Protein,Tissue,Disorder,PDiAssociations +# python3 manage.py populate_db --delete_model PPI,PDI,PDi,DrDi,Drug,Protein,Tissue,Expression,Disorder +#python3 manage.py populate_db --all -u +#python3 manage.py populate_db -p -u +#python3 manage.py populate_db --clear --all +#python3 manage.py populate_db --delete_model Disorder +#python3 manage.py populate_db -u --all +#python3 manage.py populate_db --delete_model PDI --data_dir . -pdr +#python3 manage.py populate_db --data_dir . -exp gene_tissue_expression.gct -python3 manage.py populate_db --data_dir . -p -python3 manage.py populate_db --data_dir . -exp gene_tissue_expression.gct - -python3 manage.py populate_db --data_dir . -dr +#python3 manage.py populate_db -dr #python3 manage.py populate_db --data_dir . -pdr drug-protein-interaction.txt -python3 manage.py populate_db -di +#python3 manage.py populate_db -di #python3 manage.py populate_db --data_dir . -pdi "" -ddi "" #python3 manage.py populate_db -pp protein_protein_interaction_file.txt \ No newline at end of file diff --git a/tasks/betweenness_centrality_test.py b/tasks/betweenness_centrality_test.py index 4edb6026f9ce7b34ef876840b17b38a5d051e8f5..b1a70f8e021d0e5b66e67909e89475a8319e0299 100755 --- a/tasks/betweenness_centrality_test.py +++ b/tasks/betweenness_centrality_test.py @@ -21,7 +21,7 @@ def betweenness_centrality_test(algorithm, parameters): print() print(results.get('node_attributes')) - task_hook = TaskHook(parameters, '../data_drugstone/', set_progress, set_result) + task_hook = TaskHook(parameters, '../data/', set_progress, set_result) algorithm(task_hook) class Range(object): diff --git a/tasks/closeness_centrality_test.py b/tasks/closeness_centrality_test.py index 9275dd40f8045110c1c7f7d05396ec119ef66165..095b0b325c2ba5c4bcf42f4e53f45a8092e76796 100755 --- a/tasks/closeness_centrality_test.py +++ b/tasks/closeness_centrality_test.py @@ -20,7 +20,7 @@ def closeness_centrality_test(algorithm, parameters): print() print(results.get('node_attributes')) - task_hook = TaskHook(parameters, '../data_drugstone/', set_progress, set_result) + task_hook = TaskHook(parameters, '../data/', set_progress, set_result) algorithm(task_hook) class Range(object): diff --git a/tasks/degree_centrality_test.py b/tasks/degree_centrality_test.py index b0825028f10486729727bf383b78192000325fd3..10596c53b73997f56f5227bd1aa40ec890f44337 100755 --- a/tasks/degree_centrality_test.py +++ b/tasks/degree_centrality_test.py @@ -20,7 +20,7 @@ def degree_centrality_test(algorithm, parameters): print() print(results.get('node_attributes')) - task_hook = TaskHook(parameters, '../data_drugstone/', set_progress, set_result) + task_hook = TaskHook(parameters, '../data/', set_progress, set_result) algorithm(task_hook) class Range(object): diff --git a/tasks/keypathwayminer_test.py b/tasks/keypathwayminer_test.py index d08fdb7451ea0202560fcd07275a4ee48162d0a0..f3e1383f0817d6c2e285fd286a58ea43db73877f 100755 --- a/tasks/keypathwayminer_test.py +++ b/tasks/keypathwayminer_test.py @@ -19,7 +19,7 @@ def task_test(algorithm): print(f' Edge #{j + 1}: {edge["from"]} -> {edge["to"]}') print() - task_hook = TaskHook({'k': 1, 'seeds': ['Q9BS26', 'O00124', 'P33527']}, '../data_drugstone/', set_progress, set_result) + task_hook = TaskHook({'k': 1, 'seeds': ['Q9BS26', 'O00124', 'P33527']}, '../data/', set_progress, set_result) algorithm(task_hook) diff --git a/tasks/multi_steiner_test.py b/tasks/multi_steiner_test.py index 6fd87340fb13f2821e92d57ff568ef0cfa977468..dbf5c7e0b50d8e0123b84e80b91a189ab7269de1 100755 --- a/tasks/multi_steiner_test.py +++ b/tasks/multi_steiner_test.py @@ -20,7 +20,7 @@ def multi_steiner_test(algorithm, parameters): print() print(results.get('node_attributes')) - task_hook = TaskHook(parameters, '../data_drugstone/', set_progress, set_result) + task_hook = TaskHook(parameters, '../data/', set_progress, set_result) algorithm(task_hook) if __name__ == '__main__': diff --git a/tasks/network_proximity_test.py b/tasks/network_proximity_test.py index 1bbb179c26b1f436f316973e35cc12db135e6b47..9512b0a4ee939ff30289886a08f0b3c4c5441d17 100755 --- a/tasks/network_proximity_test.py +++ b/tasks/network_proximity_test.py @@ -21,7 +21,7 @@ def network_proximity_test(algorithm, parameters): print() print(results.get('node_attributes')) - task_hook = TaskHook(parameters, '../data_drugstone/', set_progress, set_result) + task_hook = TaskHook(parameters, '../data/', set_progress, set_result) algorithm(task_hook) diff --git a/tasks/task_hook.py b/tasks/task_hook.py index 9b8a669c4cdbde6ef60c3658ea37cf83bee43656..42b860a00427f24bb0218fcdfa06ff1afb980cb2 100755 --- a/tasks/task_hook.py +++ b/tasks/task_hook.py @@ -29,7 +29,7 @@ class TaskHook: """ Returns the data directory including trailing slash. - :return: Data directory (e.g. '/app/data_drugstone/') + :return: Data directory (e.g. '/app/data/') """ return self.__data_directory diff --git a/tasks/task_test.py b/tasks/task_test.py index 206f9e8e81cd12efdcb5d0e28a601374b98b1276..7cb9db5cc81d41d16b5dfcb255725682ca2caf53 100755 --- a/tasks/task_test.py +++ b/tasks/task_test.py @@ -19,7 +19,7 @@ def task_test(algorithm): print(f' Edge #{j + 1}: {edge["from"]} -> {edge["to"]}') print() - task_hook = TaskHook({'seeds': ['Q9BS26', 'O00124', 'P33527']}, '../data_drugstone/', set_progress, set_result) + task_hook = TaskHook({'seeds': ['Q9BS26', 'O00124', 'P33527']}, '../data/', set_progress, set_result) algorithm(task_hook) diff --git a/tasks/trust_rank_test.py b/tasks/trust_rank_test.py index 62d2b91a028b578e21862ca6ca1ea0498f22d91a..bc9640d6816286a2f438bcdc33231e69d85384d5 100755 --- a/tasks/trust_rank_test.py +++ b/tasks/trust_rank_test.py @@ -20,7 +20,7 @@ def trust_rank_test(algorithm, parameters): print() print(results.get('node_attributes')) - task_hook = TaskHook(parameters, '../data_drugstone/', set_progress, set_result) + task_hook = TaskHook(parameters, '../data/', set_progress, set_result) algorithm(task_hook)