From 3cdabc51ec917f152790494b59c7a6a074c27788 Mon Sep 17 00:00:00 2001 From: AndiMajore <andi.majore@googlemail.com> Date: Mon, 4 Jul 2022 23:08:49 +0200 Subject: [PATCH] fixed updating Former-commit-id: 28e944a1579943ee2bc7247cfc99750b42840cb9 --- docker-compose.yml | 2 + .../management/commands/import_from_nedrex.py | 41 ++++++++++++++----- drugstone/management/commands/populate_db.py | 29 ++++++++++--- .../management/includes/DataPopulator.py | 3 +- drugstone/management/includes/NodeCache.py | 35 ++++++++-------- drugstone/tasks.py | 9 ++-- scripts/docker-entrypoint.sh | 4 +- 7 files changed, 86 insertions(+), 37 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 295eb0d..67a5150 100755 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -60,6 +60,8 @@ services: image: drugstone_backend container_name: drugstone_celery hostname: drugstone_celery + volumes: + - drugstone_data_volume:/usr/src/drugstone/data env_file: - './docker-django.env.dev' depends_on: diff --git a/drugstone/management/commands/import_from_nedrex.py b/drugstone/management/commands/import_from_nedrex.py index 7f5c889..6f11679 100644 --- a/drugstone/management/commands/import_from_nedrex.py +++ b/drugstone/management/commands/import_from_nedrex.py @@ -112,10 +112,12 @@ class NedrexImporter: (updates, creates) = identify_updates(proteins, self.cache.proteins) for u in updates: u.save() + self.cache.proteins[u.uniprot_code] = u models.Protein.objects.bulk_create(creates) for protein in creates: self.cache.proteins[protein.uniprot_code] = protein self.cache.protein_updates.add(protein.uniprot_code) + self.cache.init_protein_maps() return len(creates) else: models.Protein.objects.bulk_create(proteins.values()) @@ -180,13 +182,18 @@ class NedrexImporter: self.cache.init_proteins() bulk = set() + existing = set() + if update: + for edge in models.ProteinDrugInteraction.objects.filter(pdi_dataset=dataset): + existing.add(edge.__hash__()) def add_dpi(edge): try: drug = self.cache.get_drug_by_drugbank(to_id(edge['sourceDomainId'])) protein = self.cache.get_protein_by_uniprot(to_id(edge['targetDomainId'])) - if not update or (self.cache.is_new_drug(drug) or self.cache.is_new_protein(protein)): - bulk.add(models.ProteinDrugInteraction(pdi_dataset=dataset, drug=drug, protein=protein)) + e = models.ProteinDrugInteraction(pdi_dataset=dataset, drug=drug, protein=protein) + if not update or e.__hash__() not in existing: + bulk.add(e) except KeyError: pass @@ -198,6 +205,10 @@ class NedrexImporter: self.cache.init_proteins() bulk = list() + existing = set() + if update: + for edge in models.ProteinProteinInteraction.objects.filter(ppi_dataset=dataset): + existing.add(edge.__hash__()) def iter_ppi(eval): from python_nedrex import ppi @@ -215,9 +226,9 @@ class NedrexImporter: try: protein1 = self.cache.get_protein_by_uniprot(to_id(edge['memberOne'])) protein2 = self.cache.get_protein_by_uniprot(to_id(edge['memberTwo'])) - if not update or (self.cache.is_new_protein(protein1) or self.cache.is_new_protein(protein2)): - bulk.append(models.ProteinProteinInteraction(ppi_dataset=dataset, from_protein=protein1, - to_protein=protein2)) + e = models.ProteinProteinInteraction(ppi_dataset=dataset, from_protein=protein1,to_protein=protein2) + if not update or e.__hash__() not in existing: + bulk.append(e) except KeyError: pass @@ -230,14 +241,19 @@ class NedrexImporter: self.cache.init_proteins() bulk = set() + existing = set() + if update: + for edge in models.ProteinDisorderAssociation.objects.filter(pdis_dataset=dataset): + existing.add(edge.__hash__()) def add_pdis(edge): try: disorder = self.cache.get_disorder_by_mondo(to_id(edge['targetDomainId'])) for protein in self.cache.get_proteins_by_entrez(to_id(edge['sourceDomainId'])): - if not update or (self.cache.is_new_disease(disorder) or self.cache.is_new_protein(protein)): - bulk.add(models.ProteinDisorderAssociation(pdis_dataset=dataset, protein=protein, - disorder=disorder, score=edge['score'])) + e = models.ProteinDisorderAssociation(pdis_dataset=dataset, protein=protein, disorder=disorder, + score=edge['score']) + if not update or e.__hash__() not in existing: + bulk.add(e) except KeyError: pass @@ -250,13 +266,18 @@ class NedrexImporter: self.cache.init_drugs() bulk = set() + existing = set() + if update: + for edge in models.DrugDisorderIndication.objects.filter(drdi_dataset=dataset): + existing.add(edge.__hash__()) def add_drdis(edge): try: drug = self.cache.get_drug_by_drugbank(to_id(edge['sourceDomainId'])) disorder = self.cache.get_disorder_by_mondo(to_id(edge['targetDomainId'])) - if not update or (self.cache.is_new_drug(drug) or self.cache.is_new_disease(disorder)): - bulk.add(models.DrugDisorderIndication(drdi_dataset=dataset, drug=drug, disorder=disorder)) + e = models.DrugDisorderIndication(drdi_dataset=dataset, drug=drug, disorder=disorder) + if not update or e.__hash__() not in existing: + bulk.add(e) except KeyError: pass diff --git a/drugstone/management/commands/populate_db.py b/drugstone/management/commands/populate_db.py index d1083a2..476d60d 100755 --- a/drugstone/management/commands/populate_db.py +++ b/drugstone/management/commands/populate_db.py @@ -12,8 +12,6 @@ from drugstone.management.includes.NodeCache import NodeCache from drugstone.management.includes import DatasetLoader - - class DatabasePopulator: def __init__(self, data_dir): self.data_dir = data_dir @@ -93,10 +91,10 @@ def populate(kwargs): db_populator = DatabasePopulator(data_dir=data_dir) - if kwargs['clear']: + if 'clear' in kwargs and kwargs['clear']: db_populator.delete_all() - if kwargs['delete_model'] is not None: + if 'delete_model' in kwargs and kwargs['delete_model'] is not None: model_list = kwargs['delete_model'].split(',') db_populator.delete_models(model_list) @@ -105,7 +103,8 @@ def populate(kwargs): importer = NedrexImporter(nedrex_api_url, cache) populator = DataPopulator(cache) - if kwargs['all']: + total_n = 0 + if 'all' in kwargs and kwargs['all']: kwargs['drugs'] = True kwargs['disorders'] = True kwargs['proteins'] = True @@ -118,24 +117,29 @@ def populate(kwargs): if kwargs['drugs']: print('Populating Drugs...') n = NedrexImporter.import_drugs(importer, update) + total_n +=n print(f'Populated {n} Drugs.') if kwargs['disorders']: print('Populating Disorders...') n = NedrexImporter.import_disorders(importer, update) + total_n += n print(f'Populated {n} Disorders.') if kwargs['proteins']: print('Populating Proteins...') n = NedrexImporter.import_proteins(importer, update) + total_n += n print(f'Populated {n} Proteins.') print('Populating ENSG IDs...') n = DataPopulator.populate_ensg(populator, update) + total_n += n print(f'Populated {n} ENSG IDs.') if kwargs['exp']: print('Populating Expressions...') n = DataPopulator.populate_expressions(populator, update) + total_n += n print(f'Populated {n} Expressions.') if kwargs['protein_protein']: @@ -143,17 +147,21 @@ def populate(kwargs): n = NedrexImporter.import_protein_protein_interactions(importer, DatasetLoader.get_ppi_nedrex(nedrex_api_url), update) + total_n += n print(f'Imported {n} PPIs from NeDRexDB') print('Populating PPIs from STRING...') n = DataPopulator.populate_ppi_string(populator, DatasetLoader.get_ppi_string(), update) + total_n += n print(f'Populated {n} PPIs from STRING.') print('Populating PPIs from APID...') n = DataPopulator.populate_ppi_apid(populator, DatasetLoader.get_ppi_apid(), update) + total_n += n print(f'Populated {n} PPIs from APID.') print('Populating PPIs from BioGRID...') n = DataPopulator.populate_ppi_biogrid(populator, DatasetLoader.get_ppi_biogrid(), update) + total_n += n print(f'Populated {n} PPIs from BioGRID.') if kwargs['protein_drug']: @@ -161,18 +169,22 @@ def populate(kwargs): n = NedrexImporter.import_drug_target_interactions(importer, DatasetLoader.get_drug_target_nedrex(nedrex_api_url), update) + total_n += n print(f'Imported {n} PDIs from NeDRexDB') print('Populating PDIs from Chembl...') n = DataPopulator.populate_pdi_chembl(populator, DatasetLoader.get_drug_target_chembl(), update) + total_n += n print(f'Populated {n} PDIs from Chembl.') print('Populating PDIs from DGIdb...') n = DataPopulator.populate_pdi_dgidb(populator, DatasetLoader.get_drug_target_dgidb(), update) + total_n += n print(f'Populated {n} PDIs from DGIdb.') print('Populating PDIs from DrugBank...') n = DataPopulator.populate_pdi_drugbank(populator, DatasetLoader.get_drug_target_drugbank(), update) + total_n += n print(f'Populated {n} PDIs from DrugBank.') if kwargs['protein_disorder']: @@ -181,9 +193,11 @@ def populate(kwargs): DatasetLoader.get_protein_disorder_nedrex( nedrex_api_url), update) + total_n += n print(f'Imported {n} PDis from NeDRexDB') print('Populating PDis associations from DisGeNET...') n = DataPopulator.populate_pdis_disgenet(populator, DatasetLoader.get_disorder_protein_disgenet(), update) + total_n += n print(f'Populated {n} PDis associations from DisGeNET.') if kwargs['drug_disorder']: @@ -191,7 +205,12 @@ def populate(kwargs): n = NedrexImporter.import_drug_disorder_indications(importer, DatasetLoader.get_drug_disorder_nedrex(nedrex_api_url), update) + total_n += n print(f'Imported {n} DrDis from NeDRexDB') print('Populating DrDi indications from DrugBank...') n = DataPopulator.populate_drdis_drugbank(populator, DatasetLoader.get_drug_disorder_drugbank(), update) + total_n += n print(f'Populated {n} DrDi associations from DrugBank.') + + cache.clear() + return total_n diff --git a/drugstone/management/includes/DataPopulator.py b/drugstone/management/includes/DataPopulator.py index b31055f..677eac3 100755 --- a/drugstone/management/includes/DataPopulator.py +++ b/drugstone/management/includes/DataPopulator.py @@ -15,7 +15,8 @@ class DataPopulator: tissues_models = dict() for tissue_name in df.columns.values[2:]: - tissues_models[tissue_name] = models.Tissue.objects.get_or_create(name=tissue_name) + tissue,_ = models.Tissue.objects.get_or_create(name=tissue_name) + tissues_models[tissue_name] = tissue proteins_linked = 0 bulk = set() diff --git a/drugstone/management/includes/NodeCache.py b/drugstone/management/includes/NodeCache.py index 5df92d8..fdb9529 100644 --- a/drugstone/management/includes/NodeCache.py +++ b/drugstone/management/includes/NodeCache.py @@ -3,7 +3,6 @@ import drugstone.models as models class NodeCache: - proteins = dict() entrez_to_uniprot = defaultdict(lambda: set()) gene_name_to_uniprot = defaultdict(lambda: set()) @@ -14,8 +13,21 @@ class NodeCache: disorder_updates = set() protein_updates = set() + def clear(self): + self.proteins = dict() + self.entrez_to_uniprot = defaultdict(lambda: set()) + self.gene_name_to_uniprot = defaultdict(lambda: set()) + self.disorders = dict() + self.drugs = dict() + + self.drug_updates = set() + self.disorder_updates = set() + self.protein_updates = set() + def init_protein_maps(self): print("Generating protein id maps...") + self.entrez_to_uniprot = defaultdict(lambda: set()) + self.gene_name_to_uniprot = defaultdict(lambda: set()) for protein in self.proteins.values(): self.entrez_to_uniprot[protein.entrez].add(protein.uniprot_code) self.gene_name_to_uniprot[protein.gene].add(protein.uniprot_code) @@ -24,9 +36,6 @@ class NodeCache: if len(self.proteins) == 0: print("Generating protein maps...") for protein in models.Protein.objects.all(): - if protein.id < 1000: - protein.delete() - continue self.proteins[protein.uniprot_code] = protein if len(self.proteins) > 0 and (len(self.entrez_to_uniprot) == 0 or len(self.gene_name_to_uniprot) == 0): self.init_protein_maps() @@ -35,33 +44,27 @@ class NodeCache: if len(self.drugs) == 0: print("Generating drug map...") for drug in models.Drug.objects.all(): - if drug.id < 1000: - drug.delete() - continue self.drugs[drug.drug_id] = drug def init_disorders(self): if len(self.disorders) == 0: print("Generating disorder map...") for disorder in models.Disorder.objects.all(): - if disorder.id < 1000: - disorder.delete() - continue self.disorders[disorder.mondo_id] = disorder - def is_new_protein(self, protein:models.Protein): + def is_new_protein(self, protein: models.Protein): return protein.uniprot_code in self.protein_updates - def is_new_drug(self, drug:models.Drug): + def is_new_drug(self, drug: models.Drug): return drug.drug_id in self.drug_updates - def is_new_disease(self, disease:models.Disorder): + def is_new_disease(self, disease: models.Disorder): return disease.mondo_id in self.disorder_updates - def get_protein_by_uniprot(self,uniprot_id): + def get_protein_by_uniprot(self, uniprot_id): return self.proteins[uniprot_id] - def get_proteins_by_entrez(self,entrez_id): + def get_proteins_by_entrez(self, entrez_id): out = list() for g in self.entrez_to_uniprot[entrez_id]: out.append(self.proteins[g]) @@ -77,4 +80,4 @@ class NodeCache: return self.drugs[drugbank_id] def get_disorder_by_mondo(self, mondo_id): - return self.disorders[mondo_id] \ No newline at end of file + return self.disorders[mondo_id] diff --git a/drugstone/tasks.py b/drugstone/tasks.py index 03449fc..97c5ac7 100644 --- a/drugstone/tasks.py +++ b/drugstone/tasks.py @@ -6,13 +6,16 @@ from drugstone.management.commands.make_graphs import run as make_graphs logger = get_task_logger(__name__) nedrex_api_url = "http://82.148.225.92:8123/" +data_dir = "/usr/src/drugstone/data" @shared_task def task_update_db_from_nedrex(): logger.info('Updating DB from NeDRex.') logger.info('Updating data...') - populate({"all": True, "update": True}) - logger.info('Recreating networks...') - make_graphs() + n = populate({"all": True, "update": True, "data_dir": data_dir}) + logger.info(f'Added {n} entries!') + if n > 0: + logger.info('Recreating networks...') + make_graphs() logger.info('Done.') diff --git a/scripts/docker-entrypoint.sh b/scripts/docker-entrypoint.sh index 142a9f5..a525f29 100755 --- a/scripts/docker-entrypoint.sh +++ b/scripts/docker-entrypoint.sh @@ -5,7 +5,7 @@ file="store/docker-entrypoint.lock" -#if ! test -f "$file"; then +if ! test -f "$file"; then # sh scripts/import-data.sh python3 manage.py makemigrations drugstone python3 manage.py migrate @@ -14,6 +14,6 @@ file="store/docker-entrypoint.lock" python3 manage.py populate_db -u --all python3 manage.py make_graphs touch $file -#fi +fi /usr/bin/supervisord -c "/etc/supervisor/conf.d/supervisord.conf" -- GitLab