From 59d8febe149d52ff3af028dbcf30340ad0c1338a Mon Sep 17 00:00:00 2001
From: AndiMajore <andi.majore@googlemail.com>
Date: Sat, 9 Jul 2022 02:30:59 +0200
Subject: [PATCH] small update but not all fixed

Former-commit-id: 9a544d3f4e0d6988be142678c41e287e4e669685 [formerly a54f02195d5fcb8eaae8a961cd91e5dc5a2305d5]
Former-commit-id: 7afc43bd1da646344f8fea1e0e6a3702c4995c72
---
 Dockerfile                                        |   1 -
 docker-compose.yml                                |   4 +-
 .../management/commands/import_from_nedrex.py     |   8 +-
 .../management/includes/DatasetLoader.py          | 115 +++++++++++++++---
 scripts/docker-entrypoint.sh                      |   2 +-
 tasks/quick_task.py                               |   1 +
 tasks/trust_rank.py                               |   2 -
 tasks/util/read_graph_tool_graph.py               |  31 +++--
 tasks/util/scores_to_results.py                   |   2 +
 9 files changed, 132 insertions(+), 34 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 6856398..5bafbf3 100755
--- a/Dockerfile
+++ b/Dockerfile
@@ -27,7 +27,6 @@ COPY ./supervisord.conf /etc/supervisor/conf.d/supervisord.conf
 COPY ./python_nedrex/ /usr/src/drugstone/python_nedrex/
 RUN pip install /usr/src/drugstone/python_nedrex/
 
-RUN mkdir store
 
 COPY . /usr/src/drugstone/
 
diff --git a/docker-compose.yml b/docker-compose.yml
index 67a5150..a496008 100755
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -14,7 +14,6 @@ services:
     volumes:
       - drugstone_db_schema_volume:/usr/src/drugstone/drugstone/migrations
       - drugstone_data_volume:/usr/src/drugstone/data
-      - drugstone_store_volume:/usr/src/drugstone/store
     ports:
       - 8001:8000
     networks:
@@ -102,5 +101,4 @@ volumes:
   drugstone_db_schema_volume:
 #    external: true
   drugstone_data_volume:
-#    external: true
-  drugstone_store_volume:
\ No newline at end of file
+#    external: true
\ No newline at end of file
diff --git a/drugstone/management/commands/import_from_nedrex.py b/drugstone/management/commands/import_from_nedrex.py
index 650b8fc..72a20f3 100644
--- a/drugstone/management/commands/import_from_nedrex.py
+++ b/drugstone/management/commands/import_from_nedrex.py
@@ -198,7 +198,7 @@ class NedrexImporter:
                 e = models.ProteinDrugInteraction(pdi_dataset=dataset, drug=drug, protein=protein)
                 if not update or e.__hash__() not in existing:
                     bulk.add(e)
-                    for source in edge['databases']:
+                    for source in edge['dataSources']:
                         bulk.add(models.ProteinDrugInteraction(pdi_dataset=source_datasets[source], drug=drug, protein=protein))
             except KeyError:
                 pass
@@ -237,7 +237,7 @@ class NedrexImporter:
                 e = models.ProteinProteinInteraction(ppi_dataset=dataset, from_protein=protein1,
                                                      to_protein=protein2)
                 if not update or e.__hash__() not in existing:
                     bulk.append(e)
-                    for source in edge['assertedBy']:
+                    for source in edge['dataSources']:
                         bulk.append(
                             models.ProteinProteinInteraction(ppi_dataset=source_datasets[source], from_protein=protein1, to_protein=protein2))
@@ -268,7 +268,7 @@ class NedrexImporter:
                                                           score=edge['score'])
                 if not update or e.__hash__() not in existing:
                     bulk.add(e)
-                    for source in edge['assertedBy']:
+                    for source in edge['dataSources']:
                         bulk.add(
                             models.ProteinDisorderAssociation(pdis_dataset=source_datasets[source], protein=protein,
                                                               disorder=disorder, score=edge['score']))
@@ -298,7 +298,7 @@ class NedrexImporter:
                 e = models.DrugDisorderIndication(drdi_dataset=dataset, drug=drug, disorder=disorder)
                 if not update or e.__hash__() not in existing:
                     bulk.add(e)
-                    for source in edge['assertedBy']:
+                    for source in edge['dataSources']:
                         bulk.add(
                             models.DrugDisorderIndication(drdi_dataset=source_datasets[source], drug=drug, disorder=disorder))
             except KeyError:
diff --git a/drugstone/management/includes/DatasetLoader.py b/drugstone/management/includes/DatasetLoader.py
index 51710f8..8238a6a 100644
--- a/drugstone/management/includes/DatasetLoader.py
+++ b/drugstone/management/includes/DatasetLoader.py
@@ -1,3 +1,5 @@
+from requests.exceptions import RetryError
+
 from drugstone import models
 from python_nedrex.static import get_metadata
 
@@ -23,28 +25,46 @@ def get_ppi_apid():
 
 
 def get_ppi_nedrex_biogrid(url):
+    version = get_today_version()
+    try:
+        version = get_metadata()['source_databases']['biogrid']['date']
+    except RetryError:
+        pass
+
     dataset, _ = models.PPIDataset.objects.get_or_create(
         name='BioGRID',
         link=url,
-        version=get_metadata()['source_databases']['biogrid']['date']
+        version=version
     )
     return dataset
 
 
 def get_ppi_nedrex_iid(url):
+    version = get_today_version()
+    try:
+        version = get_metadata()['source_databases']['iid']['date']
+    except RetryError:
+        pass
+
     dataset, _ = models.PPIDataset.objects.get_or_create(
         name='IID',
         link=url,
-        version=get_metadata()['source_databases']['iid']['date']
+        version=version
     )
     return dataset
 
 
 def get_ppi_nedrex_intact(url):
+    version = get_today_version()
+    try:
+        version = get_metadata()['source_databases']['intact']['date']
+    except RetryError:
+        pass
+
     dataset, _ = models.PPIDataset.objects.get_or_create(
         name='IntAct',
         link=url,
-        version=get_metadata()['source_databases']['intact']['date']
+        version=version
     )
     return dataset
 
@@ -59,37 +79,61 @@ def get_ppi_biogrid():
 
 
 def get_drug_target_nedrex(url):
+    version = get_today_version()
+    try:
+        version = get_metadata()['version']
+    except RetryError:
+        pass
+
     dataset, _ = models.PDIDataset.objects.get_or_create(
         name='NeDRex',
         link=url,
-        version=get_metadata()['version'],
+        version=version
     )
     return dataset
 
 
 def get_ppi_nedrex(url):
+    version = get_today_version()
+    try:
+        version = get_metadata()['version']
+    except RetryError:
+        pass
+
     dataset, _ = models.PPIDataset.objects.get_or_create(
         name='NeDRex',
         link=url,
-        version=get_metadata()['version'],
+        version=version
     )
     return dataset
 
 
 def get_protein_disorder_nedrex(url):
+    version = get_today_version()
+    try:
+        version = get_metadata()['version']
+    except RetryError:
+        pass
+
     dataset, _ = models.PDisDataset.objects.get_or_create(
         name='NeDRex',
         link=url,
-        version=get_metadata()['version'],
+        version=version
    )
     return dataset
 
 
 def get_drug_disorder_nedrex(url):
+    version = get_today_version()
+    try:
+        version = get_metadata()['version']
+    except RetryError:
+        pass
+
     dataset, _ = models.DrDiDataset.objects.get_or_create(
         name='NeDRex',
         link=url,
-        version=get_metadata()['version'],
+        version=version
     )
     return dataset
 
@@ -138,62 +182,103 @@ def get_drug_disorder_drugbank():
     )
     return dataset
 
+def get_today_version():
+    import datetime
+    now = datetime.date.today()
+    version = f'{now.year}-{now.month}-{now.day}_temp'
+    return version
 
 def get_dis_prot_nedrex_disgenet(url):
+    version = get_today_version()
+    try:
+        version = get_metadata()['source_databases']['disgenet']['date']
+    except RetryError:
+        pass
+
     dataset, _ = models.PDisDataset.objects.get_or_create(
         name='DisGeNET',
         link=url,
-        version=get_metadata()['source_databases']['disgenet']['date']
+        version=version
     )
     return dataset
 
 
 def get_dis_prot_nedrex_omim(url):
+    version = get_today_version()
+    try:
+        version = get_metadata()['source_databases']['omim']['date']
+    except RetryError:
+        pass
+
     dataset, _ = models.PDisDataset.objects.get_or_create(
         name='OMIM',
         link=url,
-        version=get_metadata()['source_databases']['omim']['date']
+        version=version
     )
     return dataset
 
 
 def get_drdis_nedrex_drugcentral(url):
+    version = get_today_version()
+    try:
+        version = get_metadata()['source_databases']['drug_central']['date']
+    except RetryError:
+        pass
+
     dataset, _ = models.DrDiDataset.objects.get_or_create(
         name='Drug Central',
         link=url,
-        version=get_metadata()['source_databases']['drug_central']['date']
+        version=version
     )
     return dataset
 
 
 def get_drdis_nedrex_ctd(url):
+    version = get_today_version()
+    try:
+        version = get_metadata()['source_databases']['ctd']['date']
+    except RetryError:
+        pass
+
     dataset, _ = models.DrDiDataset.objects.get_or_create(
         name='CTD',
         link=url,
-        version=get_metadata()['source_databases']['ctd']['date']
+        version=version
     )
     return dataset
 
 
 def get_pdr_nedrex_drugcentral(url):
+    version = get_today_version()
+    try:
+        version = get_metadata()['source_databases']['drug_central']['date']
+    except RetryError:
+        pass
+
     dataset, _ = models.PDIDataset.objects.get_or_create(
         name='Drug Central',
         link=url,
-        version=get_metadata()['source_databases']['drug_central']['date']
+        version=version
    )
     return dataset
 
 
 def get_pdr_nedrex_drugbank(url):
+    version = get_today_version()
+    try:
+        version = get_metadata()['source_databases']['drugbank']['date']
+    except RetryError:
+        pass
+
     dataset, _ = models.PDIDataset.objects.get_or_create(
         name='DrugBank',
         link=url,
-        version=get_metadata()['source_databases']['drugbank']['date']
+        version=version
     )
     return dataset
 
 
 def get_pdr_nedrex_datasets(url):
-    return {'DrugBank': get_pdr_nedrex_drugbank(url), 'DrugCentral': get_pdr_nedrex_drugcentral(url)}
+    return {'drugbank': get_pdr_nedrex_drugbank(url), 'drugcentral': get_pdr_nedrex_drugcentral(url)}
 
 
 def get_drdis_nedrex_datasets(url):
-    return {'ctd':get_drdis_nedrex_ctd(url), 'Drug Central':get_drdis_nedrex_drugcentral(url)}
+    return {'ctd':get_drdis_nedrex_ctd(url), 'drugcentral':get_drdis_nedrex_drugcentral(url)}
 
 
 def get_ppi_nedrex_datasets(url):
     return {'biogrid':get_ppi_nedrex_biogrid(url), 'iid':get_ppi_nedrex_iid(url), 'intact':get_ppi_nedrex_intact(url)}
diff --git a/scripts/docker-entrypoint.sh b/scripts/docker-entrypoint.sh
index 0245a23..db7a2d3 100755
--- a/scripts/docker-entrypoint.sh
+++ b/scripts/docker-entrypoint.sh
@@ -4,7 +4,7 @@ python3 manage.py makemigrations drugstone
 python3 manage.py migrate
 python3 manage.py createfixtures
 python3 manage.py cleanuptasks
-python3 manage.py populate_db --update -a
+#python3 manage.py populate_db --update -a
 #python3 manage.py make_graphs
 
 /usr/bin/supervisord -c "/etc/supervisor/conf.d/supervisord.conf"
diff --git a/tasks/quick_task.py b/tasks/quick_task.py
index e4a9c5b..21f4cbf 100755
--- a/tasks/quick_task.py
+++ b/tasks/quick_task.py
@@ -45,6 +45,7 @@ def quick_task(task_hook: TaskHook):
         "result_size": 20,
         "include_non_approved_drugs": True,
         "include_indirect_drugs": False,
+        "target":"drug"
     })
 
     tr_task_hook = TaskHook(parameters, task_hook.data_directory, progress, set_result)
diff --git a/tasks/trust_rank.py b/tasks/trust_rank.py
index 890c6a5..42950ec 100755
--- a/tasks/trust_rank.py
+++ b/tasks/trust_rank.py
@@ -200,8 +200,6 @@ def trust_rank(task_hook: TaskHook):
     task_hook.set_progress(0 / 4.0, "Parsing input.")
     file_path = os.path.join(task_hook.data_directory, f"internal_{ppi_dataset['name']}_{pdi_dataset['name']}.gt")
     g, seed_ids, drug_ids = read_graph_tool_graph(file_path, seeds, max_deg, include_indirect_drugs, include_non_approved_drugs, search_target)
-    print(seed_ids)
-    print(drug_ids)
 
     task_hook.set_progress(1 / 4.0, "Computing edge weights.")
     weights = edge_weights(g, hub_penalty, inverse=True)
diff --git a/tasks/util/read_graph_tool_graph.py b/tasks/util/read_graph_tool_graph.py
index 1e9caf2..bf26636 100755
--- a/tasks/util/read_graph_tool_graph.py
+++ b/tasks/util/read_graph_tool_graph.py
@@ -37,7 +37,7 @@ def read_graph_tool_graph(file_path, seeds, max_deg, include_indirect_drugs=Fals
         The graph indices for all drug nodes
     """
 
     # Read the graph.
-
+    print(f"loading {file_path} for {target}")
     g = gt.load_graph(file_path)
     # g = gtt.extract_largest_component(gg, directed=False, prune=True) # this line is added since we need to work with the LCC of the graphs for all algorithms
@@ -47,16 +47,20 @@
     # Delete all nodes that are not contained in the selected datasets and have degrees higher than max_deg
     deleted_nodes = []
     for node in range(g.num_vertices()):
+        #Remove all unconnected nodes TODO probably already skip when creating .gt files
+        if g.vertex(node).out_degree() == 0 and target == 'drug':
+            deleted_nodes.append(node)
         # if not g.vertex_properties["name"][node] in set(seeds) and g.vertex(node).out_degree() > max_deg:
-        if not g.vertex_properties[node_name_attribute][node] in set(seeds) and g.vertex(node).out_degree() > max_deg:
+        elif not g.vertex_properties[node_name_attribute][node] in set(seeds) and (g.vertex(node).out_degree() > max_deg):
             deleted_nodes.append(node)
         # remove all drugs from graph if we are not looking for drugs
         elif target != 'drug' and g.vertex_properties["type"][node] == d_type:
             deleted_nodes.append(node)
     g.remove_vertex(deleted_nodes, fast=True)
 
-    # Retrieve internal IDs of seed_ids and viral_protein_ids.
+    # Retrieve internal IDs of seed_ids
     seeds = set(seeds)
+    print(seeds)
     seed_ids = []
     drug_ids = []
     is_matched = {protein: False for protein in seeds}
@@ -64,7 +68,7 @@
         node_type = g.vertex_properties["type"][node]
         if g.vertex_properties[node_name_attribute][node] in seeds:
             seed_ids.append(node)
-            is_matched[g.vertex_properties[node_name_attribute][node]] = True
+            is_matched[g.vertex_properties[node_name_attribute][node]] = node
         if node_type == d_type:
             if include_non_approved_drugs:
                 drug_ids.append(node)
@@ -74,9 +78,11 @@
                     drug_ids.append(node)
 
     # Check that all seed seeds have been matched and throw error, otherwise.
+    # print(deleted_nodes)
+    print(seed_ids)
     for protein, found in is_matched.items():
         if not found:
-            raise ValueError("Invaliddd seed protein {}. No node named {} in {}.".format(protein, protein, file_path))
+            raise ValueError("Invalid seed protein {}. No node named {} in {}.".format(protein, protein, file_path))
 
     # Delete edges that should be ignored or are not contained in the selected dataset.
     deleted_edges = []
@@ -87,8 +93,6 @@
             direct_drugs.add(edge.target())
         elif g.vertex_properties["type"][edge.source()] == d_type and edge.target() in seed_ids:
             direct_drugs.add(edge.source())
-    for drug in direct_drugs:
-        print(int(drug))
     for edge in g.edges():
         if g.edge_properties["type"][edge] == 'drug-protein':
             if g.vertex_properties["type"][edge.target()] == d_type:
@@ -113,6 +117,17 @@
     for edge in deleted_edges:
         g.remove_edge(edge)
     g.set_fast_edge_removal(fast=False)
-
+    print("Drugs")
+    print(drug_ids)
+    print("Vertices")
+    vertices = 0
+    for _ in g.vertices():
+        vertices += 1
+    print(f'\t{vertices}')
+    print("Edges")
+    edges = 0
+    for _ in g.edges():
+        edges+=1
+    print(f'\t{edges}')
     # Return the graph and the indices of the seed_ids and the seeds.
     return g, seed_ids, drug_ids
diff --git a/tasks/util/scores_to_results.py b/tasks/util/scores_to_results.py
index f09b161..4f47663 100755
--- a/tasks/util/scores_to_results.py
+++ b/tasks/util/scores_to_results.py
@@ -23,6 +23,7 @@ def scores_to_results(
     else:
         candidates = [(node, scores[node]) for node in range(g.num_vertices()) if scores[node] > 0 and node not in set(seed_ids)]
     best_candidates = [item[0] for item in sorted(candidates, key=lambda item: item[1], reverse=True)[:result_size]]
+    print(f'Candidate list length: {len(best_candidates)}')
 
     # Concatenate best result candidates with seeds and compute induced subgraph.
     # since the result size filters out nodes, the result network is not complete anymore.
@@ -82,6 +83,7 @@
     for edge in edges:
         if ((edge.source(), edge.target()) not in returned_edges) or ((edge.target(), edge.source()) not in returned_edges):
             returned_edges.add((edge.source(), edge.target()))
+    print(f'Returned nodes number: {len(returned_nodes)}')
     subgraph = {
         "nodes": [g.vertex_properties[node_name_attribute][node] for node in returned_nodes],
         "edges": [{"from": g.vertex_properties[node_name_attribute][source], "to": g.vertex_properties[node_name_attribute][target]} for source, target in returned_edges],
-- 
GitLab
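
Note on the DatasetLoader.py changes above: every getter repeats one pattern, namely asking NeDRex for the source database's release date via get_metadata() and falling back to a date-stamped placeholder when the metadata endpoint cannot be reached. A minimal sketch of that pattern, assuming the same python_nedrex client and RetryError behaviour as in the patch; the helper names resolve_source_version and fallback_version are illustrative and not part of the codebase:

    from datetime import date

    from requests.exceptions import RetryError
    from python_nedrex.static import get_metadata


    def fallback_version() -> str:
        # Stand-in for get_today_version() from the patch: a date-stamped
        # placeholder marking the dataset version as provisional.
        today = date.today()
        return f'{today.year}-{today.month}-{today.day}_temp'


    def resolve_source_version(source_key: str) -> str:
        # Hypothetical helper: return the date NeDRex reports for a source
        # database (e.g. 'biogrid', 'iid', 'drugbank'), or the placeholder
        # if the metadata request fails with a RetryError.
        try:
            return get_metadata()['source_databases'][source_key]['date']
        except RetryError:
            return fallback_version()

    # Example usage: version = resolve_source_version('biogrid')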