diff --git a/drugstone/management/commands/import_from_nedrex.py b/drugstone/management/commands/import_from_nedrex.py
index b3af226edb95f72395502c6fbae705dbef62b3c1..7f5c8890c20ba4daf3f113259344e72e0705e1e2 100644
--- a/drugstone/management/commands/import_from_nedrex.py
+++ b/drugstone/management/commands/import_from_nedrex.py
@@ -107,7 +107,6 @@ class NedrexImporter:
                 proteins = with_entrez
 
         iter_node_collection('gene', add_genes)
 
-        # TODO test updating ideas
         if update:
             (updates, creates) = identify_updates(proteins, self.cache.proteins)
@@ -116,6 +115,8 @@ class NedrexImporter:
             models.Protein.objects.bulk_create(creates)
             for protein in creates:
                 self.cache.proteins[protein.uniprot_code] = protein
+                self.cache.protein_updates.add(protein.uniprot_code)
+            return len(creates)
         else:
             models.Protein.objects.bulk_create(proteins.values())
             self.cache.proteins = proteins
@@ -132,14 +133,16 @@ class NedrexImporter:
 
         iter_node_collection('drug', add_drug)
 
-        # TODO test updating ideas
         if update:
             (updates, creates) = identify_updates(drugs, self.cache.drugs)
             for u in updates:
                 u.save()
+
             models.Drug.objects.bulk_create(creates)
             for drug in creates:
+                self.cache.drug_updates.add(drug.drug_id)
                 self.cache.drugs[drug.drug_id] = drug
+            return len(creates)
         else:
             models.Drug.objects.bulk_create(drugs.values())
             self.cache.drugs = drugs
@@ -157,14 +160,15 @@ class NedrexImporter:
 
         iter_node_collection('disorder', add_disorder)
 
-        # TODO test updating ideas
         if update:
             (updates, creates) = identify_updates(disorders, self.cache.disorders)
             for u in updates:
                 u.save()
             models.Disorder.objects.bulk_create(creates)
             for disorder in creates:
+                self.cache.disorder_updates.add(disorder.mondo_id)
                 self.cache.disorders[disorder.mondo_id] = disorder
+            return len(creates)
         else:
             models.Disorder.objects.bulk_create(disorders.values())
             self.cache.disorders = disorders
@@ -175,18 +179,14 @@ class NedrexImporter:
         self.cache.init_drugs()
         self.cache.init_proteins()
 
-        if update:
-            models.ProteinDrugInteraction.objects.filter(pdi_dataset=dataset).delete()
-
         bulk = set()
 
         def add_dpi(edge):
             try:
-                bulk.add(models.ProteinDrugInteraction(pdi_dataset=dataset,
-                                                       drug=self.cache.get_drug_by_drugbank(
-                                                           to_id(edge['sourceDomainId'])),
-                                                       protein=self.cache.get_protein_by_uniprot(
-                                                           to_id(edge['targetDomainId']))))
+                drug = self.cache.get_drug_by_drugbank(to_id(edge['sourceDomainId']))
+                protein = self.cache.get_protein_by_uniprot(to_id(edge['targetDomainId']))
+                if not update or (self.cache.is_new_drug(drug) or self.cache.is_new_protein(protein)):
+                    bulk.add(models.ProteinDrugInteraction(pdi_dataset=dataset, drug=drug, protein=protein))
             except KeyError:
                 pass
 
@@ -197,9 +197,6 @@ class NedrexImporter:
     def import_protein_protein_interactions(self, dataset, update):
         self.cache.init_proteins()
 
-        if update:
-            models.ProteinProteinInteraction.objects.filter(ppi_dataset=dataset).delete()
-
         bulk = list()
 
         def iter_ppi(eval):
@@ -216,11 +213,11 @@ class NedrexImporter:
 
         def add_ppi(edge):
             try:
-                bulk.append(models.ProteinProteinInteraction(ppi_dataset=dataset,
-                                                             from_protein=self.cache.get_protein_by_uniprot(
-                                                                 to_id(edge['memberOne'])),
-                                                             to_protein=self.cache.get_protein_by_uniprot(
-                                                                 to_id(edge['memberTwo']))))
+                protein1 = self.cache.get_protein_by_uniprot(to_id(edge['memberOne']))
+                protein2 = self.cache.get_protein_by_uniprot(to_id(edge['memberTwo']))
+                if not update or (self.cache.is_new_protein(protein1) or self.cache.is_new_protein(protein2)):
+                    bulk.append(models.ProteinProteinInteraction(ppi_dataset=dataset, from_protein=protein1,
+                                                                  to_protein=protein2))
             except KeyError:
                 pass
 
@@ -232,18 +229,15 @@ class NedrexImporter:
         self.cache.init_disorders()
         self.cache.init_proteins()
 
-        if update:
-            models.ProteinDisorderAssociation.objects.filter(pdis_dataset=dataset).delete()
-
         bulk = set()
 
         def add_pdis(edge):
             try:
                 disorder = self.cache.get_disorder_by_mondo(to_id(edge['targetDomainId']))
                 for protein in self.cache.get_proteins_by_entrez(to_id(edge['sourceDomainId'])):
-                    bulk.add(models.ProteinDisorderAssociation(pdis_dataset=dataset,
-                                                               protein=protein,
-                                                               disorder=disorder, score=edge['score']))
+                    if not update or (self.cache.is_new_disease(disorder) or self.cache.is_new_protein(protein)):
+                        bulk.add(models.ProteinDisorderAssociation(pdis_dataset=dataset, protein=protein,
+                                                                   disorder=disorder, score=edge['score']))
             except KeyError:
                 pass
 
@@ -255,18 +249,14 @@ class NedrexImporter:
         self.cache.init_disorders()
         self.cache.init_drugs()
 
-        if update:
-            models.DrugDisorderIndication.objects.filter(drdi_dataset=dataset).delete()
-
         bulk = set()
 
         def add_drdis(edge):
             try:
-                bulk.add(models.DrugDisorderIndication(drdi_dataset=dataset,
-                                                       drug=self.cache.get_drug_by_drugbank(
-                                                           to_id(edge['sourceDomainId'])),
-                                                       disorder=self.cache.get_disorder_by_mondo(
-                                                           to_id(edge['targetDomainId']))))
+                drug = self.cache.get_drug_by_drugbank(to_id(edge['sourceDomainId']))
+                disorder = self.cache.get_disorder_by_mondo(to_id(edge['targetDomainId']))
+                if not update or (self.cache.is_new_drug(drug) or self.cache.is_new_disease(disorder)):
+                    bulk.add(models.DrugDisorderIndication(drdi_dataset=dataset, drug=drug, disorder=disorder))
             except KeyError:
                 pass
 
diff --git a/drugstone/management/commands/make_graphs.py b/drugstone/management/commands/make_graphs.py
index a6c3d815fe20d0a355a93f133aea4bd0fd29ec6c..e024555cc98ef84ff9e8257297a745321b52a5de 100755
--- a/drugstone/management/commands/make_graphs.py
+++ b/drugstone/management/commands/make_graphs.py
@@ -187,19 +187,22 @@ class Command(BaseCommand):
         pass
 
     def handle(self, *args, **kwargs):
-        ppi_datasets = models.PPIDataset.objects.all()
-        ppi_datasets_names = [e.name for e in ppi_datasets]
-
-        pdi_datasets = models.PDIDataset.objects.all()
-        pdi_datasets_names = [e.name for e in pdi_datasets]
-
-        parameter_combinations = []
-        for protein_interaction_dataset in ppi_datasets_names:
-            for pdi_dataset in pdi_datasets_names:
-                parameter_combinations.append((protein_interaction_dataset, pdi_dataset))
-
-        # close all database connections so subprocesses will create their own connections
-        # this prevents the processes from running into problems because of using the same connection
-        db.connections.close_all()
-        pool = multiprocessing.Pool(KERNEL)
-        pool.map(create_gt, parameter_combinations)
+        run()
+
+def run():
+    ppi_datasets = models.PPIDataset.objects.all()
+    ppi_datasets_names = [e.name for e in ppi_datasets]
+
+    pdi_datasets = models.PDIDataset.objects.all()
+    pdi_datasets_names = [e.name for e in pdi_datasets]
+
+    parameter_combinations = []
+    for protein_interaction_dataset in ppi_datasets_names:
+        for pdi_dataset in pdi_datasets_names:
+            parameter_combinations.append((protein_interaction_dataset, pdi_dataset))
+
+    # close all database connections so subprocesses will create their own connections
+    # this prevents the processes from running into problems because of using the same connection
+    db.connections.close_all()
+    pool = multiprocessing.Pool(KERNEL)
+    pool.map(create_gt, parameter_combinations)
\ No newline at end of file
diff --git a/drugstone/management/commands/populate_db.py b/drugstone/management/commands/populate_db.py
index 66d878f9d97188eb4c2275f0873451d27f56e2d1..d1083a2ec08eba6957830739799919e65f8d9285 100755
--- a/drugstone/management/commands/populate_db.py
+++ b/drugstone/management/commands/populate_db.py
@@ -12,6 +12,8 @@ from drugstone.management.includes.NodeCache import NodeCache
 from drugstone.management.includes import DatasetLoader
 
 
+
+
 class DatabasePopulator:
     def __init__(self, data_dir):
         self.data_dir = data_dir
@@ -82,107 +84,114 @@ class Command(BaseCommand):
         parser.add_argument('-ddi', '--drug_disorder', action='store_true', help='Populate Drug-Disorder Indications')
 
     def handle(self, *args, **kwargs):
-        nedrex_api_url = "http://82.148.225.92:8123/"
-        data_dir = kwargs['data_dir']
-
-        db_populator = DatabasePopulator(data_dir=data_dir)
-
-        if kwargs['clear']:
-            db_populator.delete_all()
-
-        if kwargs['delete_model'] is not None:
-            model_list = kwargs['delete_model'].split(',')
-            db_populator.delete_models(model_list)
-
-        cache = NodeCache()
-        update = True if kwargs['update'] else False
-        importer = NedrexImporter(nedrex_api_url, cache)
-        populator = DataPopulator(cache)
-
-        if kwargs['all']:
-            kwargs['drugs'] = True
-            kwargs['disorders'] = True
-            kwargs['proteins'] = True
-            kwargs['exp'] = True
-            kwargs['protein_protein'] = True
-            kwargs['protein_drug'] = True
-            kwargs['protein_disorder'] = True
-            kwargs['drug_disorder'] = True
-
-        if kwargs['drugs']:
-            print('Populating Drugs...')
-            n = NedrexImporter.import_drugs(importer, update)
-            print(f'Populated {n} Drugs.')
-
-        if kwargs['disorders']:
-            print('Populating Disorders...')
-            n = NedrexImporter.import_disorders(importer, update)
-            print(f'Populated {n} Disorders.')
-
-        if kwargs['proteins']:
-            print('Populating Proteins...')
-            n = NedrexImporter.import_proteins(importer, update)
-            print(f'Populated {n} Proteins.')
-            print('Populating ENSG IDs...')
-            n = DataPopulator.populate_ensg(populator,update)
-            print(f'Populated {n} ENSG IDs.')
-
-        if kwargs['exp']:
-            print('Populating Expressions...')
-            n = DataPopulator.populate_expressions(populator, update)
-            print(f'Populated {n} Expressions.')
-
-        if kwargs['protein_protein']:
-            print('Importing PPIs from NeDRexDB...')
-            n = NedrexImporter.import_protein_protein_interactions(importer,
+        populate(kwargs)
+
+def populate(kwargs):
+
+    nedrex_api_url = "http://82.148.225.92:8123/"
+    data_dir = kwargs['data_dir']
+
+    db_populator = DatabasePopulator(data_dir=data_dir)
+
+    if kwargs['clear']:
+        db_populator.delete_all()
+
+    if kwargs['delete_model'] is not None:
+        model_list = kwargs['delete_model'].split(',')
+        db_populator.delete_models(model_list)
+
+    cache = NodeCache()
+    update = True if kwargs['update'] else False
+    importer = NedrexImporter(nedrex_api_url, cache)
+    populator = DataPopulator(cache)
+
+    if kwargs['all']:
+        kwargs['drugs'] = True
+        kwargs['disorders'] = True
+        kwargs['proteins'] = True
+        kwargs['exp'] = True
+        kwargs['protein_protein'] = True
+        kwargs['protein_drug'] = True
+        kwargs['protein_disorder'] = True
+        kwargs['drug_disorder'] = True
+
+    if kwargs['drugs']:
+        print('Populating Drugs...')
+        n = NedrexImporter.import_drugs(importer, update)
+        print(f'Populated {n} Drugs.')
+
+    if kwargs['disorders']:
+        print('Populating Disorders...')
+        n = NedrexImporter.import_disorders(importer, update)
+        print(f'Populated {n} Disorders.')
+
+    if kwargs['proteins']:
+        print('Populating Proteins...')
+        n = NedrexImporter.import_proteins(importer, update)
+        print(f'Populated {n} Proteins.')
+        print('Populating ENSG IDs...')
+        n = DataPopulator.populate_ensg(populator, update)
+        print(f'Populated {n} ENSG IDs.')
+
+    if kwargs['exp']:
+        print('Populating Expressions...')
+        n = DataPopulator.populate_expressions(populator, update)
+        print(f'Populated {n} Expressions.')
+
+    if kwargs['protein_protein']:
+        print('Importing PPIs from NeDRexDB...')
+        n = NedrexImporter.import_protein_protein_interactions(importer,
                                                                     DatasetLoader.get_ppi_nedrex(nedrex_api_url),
                                                                     update)
-            print(f'Imported {n} PPIs from NeDRexDB')
-            print('Populating PPIs from STRING...')
-            n = DataPopulator.populate_ppi_string(populator, DatasetLoader.get_ppi_string(), update)
-            print(f'Populated {n} PPIs from STRING.')
-
-            print('Populating PPIs from APID...')
-            n = DataPopulator.populate_ppi_apid(populator, DatasetLoader.get_ppi_apid(), update)
-            print(f'Populated {n} PPIs from APID.')
-
-            print('Populating PPIs from BioGRID...')
-            n = DataPopulator.populate_ppi_biogrid(populator, DatasetLoader.get_ppi_biogrid(), update)
-            print(f'Populated {n} PPIs from BioGRID.')
-
-        if kwargs['protein_drug']:
-            print('Importing PDIs from NeDRexDB...')
-            n = NedrexImporter.import_drug_target_interactions(importer, DatasetLoader.get_drug_target_nedrex(nedrex_api_url), update)
-            print(f'Imported {n} PDIs from NeDRexDB')
-
-            print('Populating PDIs from Chembl...')
-            n = DataPopulator.populate_pdi_chembl(populator,DatasetLoader.get_drug_target_chembl(), update)
-            print(f'Populated {n} PDIs from Chembl.')
-
-            print('Populating PDIs from DGIdb...')
-            n = DataPopulator.populate_pdi_dgidb(populator, DatasetLoader.get_drug_target_dgidb(), update)
-            print(f'Populated {n} PDIs from DGIdb.')
-
-            print('Populating PDIs from DrugBank...')
-            n = DataPopulator.populate_pdi_drugbank(populator, DatasetLoader.get_drug_target_drugbank(), update)
-            print(f'Populated {n} PDIs from DrugBank.')
-
-        if kwargs['protein_disorder']:
-            print('Importing PDis from NeDRexDB...')
-            n = NedrexImporter.import_protein_disorder_associations(importer,
-                                                                    DatasetLoader.get_protein_disorder_nedrex(nedrex_api_url),
-                                                                    update)
-            print(f'Imported {n} PDis from NeDRexDB')
-            print('Populating PDis associations from DisGeNET...')
-            n = DataPopulator.populate_pdis_disgenet(populator, DatasetLoader.get_disorder_protein_disgenet(), update)
-            print(f'Populated {n} PDis associations from DisGeNET.')
-
-        if kwargs['drug_disorder']:
-            print('Importing DrDis from NeDRexDB...')
-            n = NedrexImporter.import_drug_disorder_indications(importer,
-                                                                DatasetLoader.get_drug_disorder_nedrex(nedrex_api_url),
-                                                                update)
-            print(f'Imported {n} DrDis from NeDRexDB')
-            print('Populating DrDi indications from DrugBank...')
-            n = DataPopulator.populate_drdis_drugbank(populator, DatasetLoader.get_drug_disorder_drugbank(), update)
-            print(f'Populated {n} DrDi associations from DrugBank.')
+        print(f'Imported {n} PPIs from NeDRexDB')
+        print('Populating PPIs from STRING...')
+        n = DataPopulator.populate_ppi_string(populator, DatasetLoader.get_ppi_string(), update)
+        print(f'Populated {n} PPIs from STRING.')
+
+        print('Populating PPIs from APID...')
+        n = DataPopulator.populate_ppi_apid(populator, DatasetLoader.get_ppi_apid(), update)
+        print(f'Populated {n} PPIs from APID.')
+
+        print('Populating PPIs from BioGRID...')
+        n = DataPopulator.populate_ppi_biogrid(populator, DatasetLoader.get_ppi_biogrid(), update)
+        print(f'Populated {n} PPIs from BioGRID.')
+
+    if kwargs['protein_drug']:
+        print('Importing PDIs from NeDRexDB...')
+        n = NedrexImporter.import_drug_target_interactions(importer,
+                                                            DatasetLoader.get_drug_target_nedrex(nedrex_api_url),
+                                                            update)
+        print(f'Imported {n} PDIs from NeDRexDB')
+
+        print('Populating PDIs from Chembl...')
+        n = DataPopulator.populate_pdi_chembl(populator, DatasetLoader.get_drug_target_chembl(), update)
+        print(f'Populated {n} PDIs from Chembl.')
+
+        print('Populating PDIs from DGIdb...')
+        n = DataPopulator.populate_pdi_dgidb(populator, DatasetLoader.get_drug_target_dgidb(), update)
+        print(f'Populated {n} PDIs from DGIdb.')
+
+        print('Populating PDIs from DrugBank...')
+        n = DataPopulator.populate_pdi_drugbank(populator, DatasetLoader.get_drug_target_drugbank(), update)
+        print(f'Populated {n} PDIs from DrugBank.')
+
+    if kwargs['protein_disorder']:
+        print('Importing PDis from NeDRexDB...')
+        n = NedrexImporter.import_protein_disorder_associations(importer,
+                                                                 DatasetLoader.get_protein_disorder_nedrex(
+                                                                     nedrex_api_url),
+                                                                 update)
+        print(f'Imported {n} PDis from NeDRexDB')
+        print('Populating PDis associations from DisGeNET...')
+        n = DataPopulator.populate_pdis_disgenet(populator, DatasetLoader.get_disorder_protein_disgenet(), update)
+        print(f'Populated {n} PDis associations from DisGeNET.')
+
+    if kwargs['drug_disorder']:
+        print('Importing DrDis from NeDRexDB...')
+        n = NedrexImporter.import_drug_disorder_indications(importer,
+                                                             DatasetLoader.get_drug_disorder_nedrex(nedrex_api_url),
+                                                             update)
+        print(f'Imported {n} DrDis from NeDRexDB')
+        print('Populating DrDi indications from DrugBank...')
+        n = DataPopulator.populate_drdis_drugbank(populator, DatasetLoader.get_drug_disorder_drugbank(), update)
+        print(f'Populated {n} DrDi associations from DrugBank.')
diff --git a/drugstone/management/includes/DataPopulator.py b/drugstone/management/includes/DataPopulator.py
index 5858202f82fd095b0c879e423a22a0f973015c5f..b31055f423acaccbd27bd79f799d3c34eac1e4a0 100755
--- a/drugstone/management/includes/DataPopulator.py
+++ b/drugstone/management/includes/DataPopulator.py
@@ -9,19 +9,13 @@ class DataPopulator:
         self.cache = cache
 
     def populate_expressions(self, update):
-        if update:
-            models.ExpressionLevel.objects.all().delete()
         self.cache.init_proteins()
         df = DataLoader.load_expressions()
 
         tissues_models = dict()
         for tissue_name in df.columns.values[2:]:
-            try:
-                tissue_model = models.Tissue.objects.get(name=tissue_name)
-            except models.Tissue.DoesNotExist:
-                tissue_model = models.Tissue.objects.create(name=tissue_name)
-            tissues_models[tissue_name] = tissue_model
+            tissues_models[tissue_name] = models.Tissue.objects.get_or_create(name=tissue_name)
 
         proteins_linked = 0
         bulk = set()
@@ -33,16 +27,16 @@ class DataPopulator:
 
             for protein_model in self.cache.get_proteins_by_gene(gene_name):
                 proteins_linked += 1
-
-                for tissue_name, tissue_model in tissues_models.items():
-                    expr = models.ExpressionLevel(protein=protein_model,
-                                                  tissue=tissue_model,
-                                                  expression_level=row[tissue_name])
-                    id = hash(expr)
-                    if id in uniq:
-                        continue
-                    uniq.add(id)
-                    bulk.add(expr)
+                if not update or self.cache.is_new_protein(protein_model):
+                    for tissue_name, tissue_model in tissues_models.items():
+                        expr = models.ExpressionLevel(protein=protein_model,
+                                                      tissue=tissue_model,
+                                                      expression_level=row[tissue_name])
+                        id = hash(expr)
+                        if id in uniq:
+                            continue
+                        uniq.add(id)
+                        bulk.add(expr)
             if len(bulk) > 100000:
                 models.ExpressionLevel.objects.bulk_create(bulk)
                 size += len(bulk)
@@ -59,8 +53,6 @@ class DataPopulator:
         Returns:
             int: Count of how many ensg-protein relations were added
         """
-        if update:
-            models.EnsemblGene.objects.all().delete()
         self.cache.init_proteins()
         data = DataLoader.load_ensg()
         bulk = list()
@@ -69,7 +61,8 @@ class DataPopulator:
             proteins = self.cache.get_proteins_by_entrez(entrez)
             for protein in proteins:
                 for ensg in ensg_list:
-                    bulk.append(models.EnsemblGene(name=ensg, protein=protein))
+                    if not update or self.cache.is_new_protein(protein):
+                        bulk.append(models.EnsemblGene(name=ensg, protein=protein))
 
         models.EnsemblGene.objects.bulk_create(bulk)
         return len(bulk)
@@ -81,8 +74,6 @@ class DataPopulator:
            int: Count of how many interactions were added
        """
         self.cache.init_proteins()
-        if update:
-            models.ProteinProteinInteraction.objects.filter(ppi_dataset=dataset).delete()
 
         df = DataLoader.load_ppi_string()
         bulk = list()
@@ -92,19 +83,15 @@ class DataPopulator:
                 proteins_a = self.cache.get_proteins_by_entrez(row['entrez_a'])
                 proteins_b = self.cache.get_proteins_by_entrez(row['entrez_b'])
             except KeyError:
-                # continue if not found
                 continue
             for protein_a in proteins_a:
                 for protein_b in proteins_b:
-                    try:
+                    if not update or (self.cache.is_new_protein(protein_a) or self.cache.is_new_protein(protein_b)):
                         bulk.append(models.ProteinProteinInteraction(
                             ppi_dataset=dataset,
                             from_protein=protein_a,
                             to_protein=protein_b
                         ))
-                    except models.ValidationError:
-                        # duplicate
-                        continue
 
         models.ProteinProteinInteraction.objects.bulk_create(bulk)
         return len(bulk)
@@ -117,8 +104,6 @@ class DataPopulator:
        """
         self.cache.init_proteins()
 
-        if update:
-            models.ProteinProteinInteraction.objects.filter(ppi_dataset=dataset).delete()
         df = DataLoader.load_ppi_apid()
         bulk = set()
         for _, row in df.iterrows():
@@ -129,14 +114,12 @@ class DataPopulator:
             except KeyError:
                 # continue if not found
                 continue
-            try:
+            if not update or (self.cache.is_new_protein(protein_a) or self.cache.is_new_protein(protein_b)):
                 bulk.add(models.ProteinProteinInteraction(
                     ppi_dataset=dataset,
                     from_protein=protein_a,
                     to_protein=protein_b
                 ))
-            except models.ValidationError:
-                continue
 
         models.ProteinProteinInteraction.objects.bulk_create(bulk)
         return len(bulk)
@@ -149,8 +132,6 @@ class DataPopulator:
        """
         self.cache.init_proteins()
 
-        if update:
-            models.ProteinProteinInteraction.objects.filter(ppi_dataset=dataset).delete()
         df = DataLoader.load_ppi_biogrid()
         bulk = list()
         for _, row in df.iterrows():
@@ -164,15 +145,12 @@ class DataPopulator:
                 continue
             for protein_a in proteins_a:
                 for protein_b in proteins_b:
-                    try:
+                    if not update or (self.cache.is_new_protein(protein_a) or self.cache.is_new_protein(protein_b)):
                         bulk.append(models.ProteinProteinInteraction(
                             ppi_dataset=dataset,
                             from_protein=protein_a,
                             to_protein=protein_b
                         ))
-                    except models.ValidationError:
-                        # duplicate
-                        continue
 
         models.ProteinProteinInteraction.objects.bulk_create(bulk)
         return len(bulk)
@@ -186,8 +164,6 @@ class DataPopulator:
         self.cache.init_proteins()
         self.cache.init_drugs()
 
-        if update:
-            models.ProteinDrugInteraction.objects.filter(pdi_dataset=dataset).delete()
         df = DataLoader.load_pdi_chembl()
         bulk = set()
         for _, row in df.iterrows():
@@ -202,11 +178,12 @@ class DataPopulator:
             except KeyError:
                 # continue if not found
                 continue
-            bulk.add(models.ProteinDrugInteraction(
-                pdi_dataset=dataset,
-                protein=protein,
-                drug=drug
-            ))
+            if not update or (self.cache.is_new_protein(protein) or self.cache.is_new_drug(drug)):
+                bulk.add(models.ProteinDrugInteraction(
+                    pdi_dataset=dataset,
+                    protein=protein,
+                    drug=drug
+                ))
 
         models.ProteinDrugInteraction.objects.bulk_create(bulk)
         return len(bulk)
@@ -220,9 +197,6 @@ class DataPopulator:
        """
         self.cache.init_proteins()
         self.cache.init_disorders()
-
-        if update:
-            models.ProteinDisorderAssociation.objects.filter(pdis_dataset=dataset).delete()
         df = DataLoader.load_pdis_disgenet()
         bulk = set()
         for _, row in df.iterrows():
@@ -238,12 +212,13 @@ class DataPopulator:
             except KeyError:
                 # continue if not found
                 continue
-            bulk.add(models.ProteinDisorderAssociation(
-                pdis_dataset=dataset,
-                protein=protein,
-                disorder=disorder,
-                score=row['score']
-            ))
+            if not update or (self.cache.is_new_protein(protein) or self.cache.is_new_disease(disorder)):
+                bulk.add(models.ProteinDisorderAssociation(
+                    pdis_dataset=dataset,
+                    protein=protein,
+                    disorder=disorder,
+                    score=row['score']
+                ))
 
         models.ProteinDisorderAssociation.objects.bulk_create(bulk)
         return len(bulk)
@@ -256,8 +231,6 @@ class DataPopulator:
        """
         self.cache.init_drugs()
         self.cache.init_disorders()
-        if update:
-            models.DrugDisorderIndication.objects.filter(drdi_dataset=dataset).delete()
 
         df = DataLoader.load_drdis_drugbank()
         bulk = set()
@@ -274,11 +247,12 @@ class DataPopulator:
             except KeyError:
                 # continue if not found
                 continue
-            bulk.add(models.DrugDisorderIndication(
-                drdi_dataset=dataset,
-                drug=drug,
-                disorder=disorder,
-            ))
+            if not update or (self.cache.is_new_drug(drug) or self.cache.is_new_disease(disorder)):
+                bulk.add(models.DrugDisorderIndication(
+                    drdi_dataset=dataset,
+                    drug=drug,
+                    disorder=disorder,
+                ))
 
         models.DrugDisorderIndication.objects.bulk_create(bulk)
         return len(bulk)
@@ -292,29 +266,24 @@ class DataPopulator:
         self.cache.init_proteins()
         self.cache.init_drugs()
 
-        if update:
-            models.ProteinDrugInteraction.objects.filter(pdi_dataset=dataset).delete()
         df = DataLoader.load_pdi_dgidb()
         bulk = set()
         for _, row in df.iterrows():
             try:
-                # try fetching protein
                 proteins = self.cache.get_proteins_by_entrez(row['entrez_id'])
             except KeyError:
-                # continue if not found
                 continue
             try:
-                # try fetching drug
                 drug = self.cache.get_drug_by_drugbank(row['drug_id'])
             except KeyError:
-                # continue if not found
                 continue
             for protein in proteins:
-                bulk.add(models.ProteinDrugInteraction(
-                    pdi_dataset=dataset,
-                    protein=protein,
-                    drug=drug
-                ))
+                if not update or (self.cache.is_new_protein(protein) or self.cache.is_new_drug(drug)):
+                    bulk.add(models.ProteinDrugInteraction(
+                        pdi_dataset=dataset,
+                        protein=protein,
+                        drug=drug
+                    ))
 
         models.ProteinDrugInteraction.objects.bulk_create(bulk)
         return len(bulk)
@@ -328,29 +297,23 @@ class DataPopulator:
         self.cache.init_proteins()
         self.cache.init_drugs()
-
-        if update:
-            models.ProteinDrugInteraction.objects.filter(pdi_dataset=dataset).delete()
         df = DataLoader.load_pdi_drugbank()
         bulk = set()
         for _, row in df.iterrows():
             try:
-                # try fetching protein
                 proteins = self.cache.get_proteins_by_entrez(row['entrez_id'])
             except KeyError:
-                # continue if not found
                 continue
             try:
-                # try fetching drug
                 drug = self.cache.get_drug_by_drugbank(row['drug_id'])
             except KeyError:
-                # continue if not found
                 continue
             for protein in proteins:
-                bulk.add(models.ProteinDrugInteraction(
-                    pdi_dataset=dataset,
-                    protein=protein,
-                    drug=drug
-                ))
+                if not update or (self.cache.is_new_protein(protein) or self.cache.is_new_drug(drug)):
+                    bulk.add(models.ProteinDrugInteraction(
+                        pdi_dataset=dataset,
+                        protein=protein,
+                        drug=drug
+                    ))
 
         models.ProteinDrugInteraction.objects.bulk_create(bulk)
         return len(bulk)
diff --git a/drugstone/management/includes/NodeCache.py b/drugstone/management/includes/NodeCache.py
index 7f9491c0e52d16aede95abbf2b1824f2cdcd02aa..5df92d8e46e11bb617fe1acd4afb5c476591cc97 100644
--- a/drugstone/management/includes/NodeCache.py
+++ b/drugstone/management/includes/NodeCache.py
@@ -10,6 +10,10 @@ class NodeCache:
     disorders = dict()
     drugs = dict()
 
+    drug_updates = set()
+    disorder_updates = set()
+    protein_updates = set()
+
     def init_protein_maps(self):
         print("Generating protein id maps...")
         for protein in self.proteins.values():
@@ -20,23 +24,39 @@ class NodeCache:
         if len(self.proteins) == 0:
             print("Generating protein maps...")
             for protein in models.Protein.objects.all():
+                if protein.id < 1000:
+                    protein.delete()
+                    continue
                 self.proteins[protein.uniprot_code] = protein
         if len(self.proteins) > 0 and (len(self.entrez_to_uniprot) == 0 or len(self.gene_name_to_uniprot) == 0):
             self.init_protein_maps()
 
-
     def init_drugs(self):
         if len(self.drugs) == 0:
             print("Generating drug map...")
             for drug in models.Drug.objects.all():
+                if drug.id < 1000:
+                    drug.delete()
+                    continue
                 self.drugs[drug.drug_id] = drug
 
     def init_disorders(self):
         if len(self.disorders) == 0:
             print("Generating disorder map...")
             for disorder in models.Disorder.objects.all():
+                if disorder.id < 1000:
+                    disorder.delete()
+                    continue
                 self.disorders[disorder.mondo_id] = disorder
 
+    def is_new_protein(self, protein:models.Protein):
+        return protein.uniprot_code in self.protein_updates
+
+    def is_new_drug(self, drug:models.Drug):
+        return drug.drug_id in self.drug_updates
+
+    def is_new_disease(self, disease:models.Disorder):
+        return disease.mondo_id in self.disorder_updates
     def get_protein_by_uniprot(self,uniprot_id):
         return self.proteins[uniprot_id]
diff --git a/drugstone/models.py b/drugstone/models.py
index 5391a43d5383bf0ca5c3c5fbabe248714fada698..b2c7227a32d68bdd8285671e508d12a41f1ffe17 100755
--- a/drugstone/models.py
+++ b/drugstone/models.py
@@ -84,6 +84,9 @@ class Protein(models.Model):
     def __ne__(self, other):
         return not self.__eq__(other)
 
+    def __hash__(self):
+        return hash((self.uniprot_code, self.gene, self.entrez))
+
     def update(self, other):
         self.uniprot_code = other.uniprot_code
         self.gene = other.gene
diff --git a/drugstone/settings/celery_schedule.py b/drugstone/settings/celery_schedule.py
index de5a78a072248088a319b48a8149e2359a5e9066..b066327c53c3c8f9beb7f969723311d538371221 100644
--- a/drugstone/settings/celery_schedule.py
+++ b/drugstone/settings/celery_schedule.py
@@ -3,6 +3,6 @@ from celery.schedules import crontab
 CELERY_BEAT_SCHEDULE = {
     'update_db': {
         'task': 'drugstone.tasks.task_update_db_from_nedrex',
-        'schedule': crontab(minute='*/1'),
+        'schedule': crontab(day_of_week=1, hour=5, minute=0),
     },
 }
diff --git a/drugstone/tasks.py b/drugstone/tasks.py
index 74511629381312164e31a5c8990f90b036fbc25b..03449fcbb517b135f40a14ecda99c2ee40346890 100644
--- a/drugstone/tasks.py
+++ b/drugstone/tasks.py
@@ -1,18 +1,18 @@
 from celery import shared_task
 from celery.utils.log import get_task_logger
-from drugstone.util.nedrex import fetch_nedrex_data, integrate_nedrex_data
+from drugstone.management.commands.populate_db import populate
+from drugstone.management.commands.make_graphs import run as make_graphs
 
 logger = get_task_logger(__name__)
 
+nedrex_api_url = "http://82.148.225.92:8123/"
+
 
 @shared_task
 def task_update_db_from_nedrex():
     logger.info('Updating DB from NeDRex.')
-    print('here')
-
-    logger.info('Fetching data...')
-    # fetch_nedrex_data()
-
-    logger.info('Integrating data...')
-    # integrate_nedrex_data()
+    logger.info('Updating data...')
+    populate({"all": True, "update": True})
+    logger.info('Recreating networks...')
+    make_graphs()
     logger.info('Done.')
diff --git a/scripts/docker-entrypoint.sh b/scripts/docker-entrypoint.sh
index a525f29b12a2057849939a4e3f93552038537bc2..142a9f55e7a798546be157b7feb27b32d445a8ed 100755
--- a/scripts/docker-entrypoint.sh
+++ b/scripts/docker-entrypoint.sh
@@ -5,7 +5,7 @@
 
 file="store/docker-entrypoint.lock"
 
-if ! test -f "$file"; then
+#if ! test -f "$file"; then
     # sh scripts/import-data.sh
     python3 manage.py makemigrations drugstone
     python3 manage.py migrate
@@ -14,6 +14,6 @@ if ! test -f "$file"; then
     python3 manage.py populate_db -u --all
     python3 manage.py make_graphs
     touch $file
-fi
+#fi
 
 /usr/bin/supervisord -c "/etc/supervisor/conf.d/supervisord.conf"