From 56815305d81b02710078b2af3ab4e09f0f497f89 Mon Sep 17 00:00:00 2001 From: AndiMajore <andi.majore@googlemail.com> Date: Wed, 27 Jul 2022 19:18:29 +0200 Subject: [PATCH] all algorithms working for symbol space Former-commit-id: e1e10a4eb4ea5d29f363d583218e58717b4ea711 [formerly 8d18e74f57420d754b5ece6537fe0260468e7445] Former-commit-id: aa0d2d2290e67a30b44018c517ba073e5c3dd153 --- tasks/closeness_centrality.py | 20 +++++++----- tasks/network_proximity.py | 2 +- tasks/quick_task.py | 49 +++++++---------------------- tasks/trust_rank.py | 4 +-- tasks/util/read_graph_tool_graph.py | 25 +++++++-------- tasks/util/scores_to_results.py | 2 -- 6 files changed, 39 insertions(+), 63 deletions(-) diff --git a/tasks/closeness_centrality.py b/tasks/closeness_centrality.py index 7ae2772..1c306ee 100755 --- a/tasks/closeness_centrality.py +++ b/tasks/closeness_centrality.py @@ -1,3 +1,4 @@ +import numpy as np from tasks.util.read_graph_tool_graph import read_graph_tool_graph from tasks.util.scores_to_results import scores_to_results from tasks.util.edge_weights import edge_weights @@ -117,7 +118,6 @@ def closeness_centrality(task_hook: TaskHook): # Reasonable default: False. # Has no effect unless trust_rank.py is used for ranking drugs. include_indirect_drugs = task_hook.parameters.get("include_indirect_drugs", False) - # Type: bool # Semantics: Sepcifies whether should be included in the analysis when ranking drugs. # Example: False. @@ -164,7 +164,7 @@ def closeness_centrality(task_hook: TaskHook): pdi_dataset = task_hook.parameters.get("pdi_dataset") - search_target = task_hook.parameters.get("target", "drug-target") + search_target = task_hook.parameters.get("target", "drug") filterPaths = task_hook.parameters.get("filter_paths", True) @@ -173,11 +173,12 @@ def closeness_centrality(task_hook: TaskHook): id_space = task_hook.parameters["config"].get("identifier", "symbol") + node_name_attribute = "internal_id" + filename = f"{id_space}_{ppi_dataset['name']}-{pdi_dataset['name']}" if ppi_dataset['licenced'] or pdi_dataset['licenced']: filename += "_licenced" filename = os.path.join(task_hook.data_directory, filename + ".gt") - # g, seed_ids, viral_protein_ids, drug_ids = read_graph_tool_graph(file_path, seeds, datasets, ignored_edge_types, max_deg, ignore_non_seed_baits, include_indirect_drugs, include_non_approved_drugs) g, seed_ids, drug_ids = read_graph_tool_graph(filename, seeds, id_space, max_deg, include_indirect_drugs, include_non_approved_drugs, search_target) task_hook.set_progress(1 / 4.0, "Computing edge weights.") weights = edge_weights(g, hub_penalty) @@ -189,12 +190,15 @@ def closeness_centrality(task_hook: TaskHook): # Call graph-tool to compute TrustRank. task_hook.set_progress(2 / 4.0, "Computing shortest path closeness centralities.") all_dists = [] + # score_nodes = drug_ids if search_target == 'drug' else seed_ids for node in seed_ids: - all_dists.append(gtt.shortest_distance(g, node, weights=weights)) - scores = len(seeds) / sum([dists.get_array() for dists in all_dists]) - + dists = gtt.shortest_distance(g, node, weights=weights).get_array() + dists[dists == np.inf] = 99999999999 + all_dists.append(dists+1) + + scores = len(seed_ids) / (sum([dists for dists in all_dists])) + # Compute and return the results. - task_hook.set_progress(3 / 4.0, "Formating results.") - # task_hook.set_results(scores_to_results(strain_or_drugs, result_size, g, seed_ids, viral_protein_ids, drug_ids, scores)) + task_hook.set_progress(3 / 4.0, "Formatting results.") task_hook.set_results(scores_to_results(search_target, result_size, g, seed_ids, drug_ids, scores, ppi_dataset, pdi_dataset, filterPaths)) diff --git a/tasks/network_proximity.py b/tasks/network_proximity.py index 716c7a3..c07beee 100755 --- a/tasks/network_proximity.py +++ b/tasks/network_proximity.py @@ -79,7 +79,7 @@ def network_proximity(task_hook: TaskHook): filter_paths = task_hook.parameters.get("filter_paths", True) - node_name_attribute = "drugstone_id" # nodes in the input network which is created from RepoTrialDB have primaryDomainId as name attribute + node_name_attribute = "internal_id" # nodes in the input network which is created from RepoTrialDB have primaryDomainId as name attribute # Set number of threads if OpenMP support is enabled. if gt.openmp_enabled(): gt.openmp_set_num_threads(num_threads) diff --git a/tasks/quick_task.py b/tasks/quick_task.py index 21f4cbf..899674d 100755 --- a/tasks/quick_task.py +++ b/tasks/quick_task.py @@ -1,25 +1,14 @@ from tasks.task_hook import TaskHook - -def infer_node_type(node): # TODO: This needs to be improved - if len(node) == 6 or len(node) == 10: - return 'protein' - # if node.startswith('DB'): - # return 'drug' - # return 'virus' - if node.startswith('DB'): - return 'drug' - return 'protein' - - def quick_task(task_hook: TaskHook): - def run_closeness(parameters): + def run_closeness(parameters, network): from .closeness_centrality import closeness_centrality def closeness_progress(progress, status): task_hook.set_progress(2 / 3 + 1 / 3 * progress, status) def closeness_set_result(result): + result["network"]["edges"].extend(network["edges"]) task_hook.set_results(result) # Prepare intermediate hook @@ -31,25 +20,6 @@ def quick_task(task_hook: TaskHook): # Run closeness centrality closeness_centrality(closeness_task_hook) - def run_trust_rank(parameters, seeds): - from .trust_rank import trust_rank - - def progress(progress, status): - task_hook.set_progress(2 / 3 + 1 / 3 * progress, status) - - def set_result(result): - task_hook.set_results(result) - - parameters.update({ - "seeds": seeds, - "result_size": 20, - "include_non_approved_drugs": True, - "include_indirect_drugs": False, - "target":"drug" - }) - - tr_task_hook = TaskHook(parameters, task_hook.data_directory, progress, set_result) - trust_rank(tr_task_hook) def run_multi_steiner(parameters): from .multi_steiner import multi_steiner @@ -60,15 +30,20 @@ def quick_task(task_hook: TaskHook): def ms_set_result(result): node_attributes = result.get("node_attributes", {}) node_types = node_attributes.get("node_types", {}) - # seeds = [seed for seed in result["network"]["nodes"] if node_types.get(seed) == 'host' or - # (not node_types.get(seed) and infer_node_type(seed) == 'host')] - seeds = [seed for seed in result["network"]["nodes"] if node_types.get(seed) == 'protein' or - (not node_types.get(seed) and infer_node_type(seed) == 'protein')] + seeds = [seed for seed in result["network"]["nodes"] if node_types.get(seed) == 'protein'] + if len(seeds) == 0: task_hook.set_results({"network": {"nodes": [], "edges": []}}) return - run_trust_rank(parameters, seeds) + parameters.update({ + "seeds": seeds, + "result_size": 10, + "hub_penalty": 1, + "target": "drug", + "include_non_approved_drugs": True + }) + run_closeness(parameters, result["network"]) parameters["num_trees"] = 1 parameters["hub_penalty"] = 1 diff --git a/tasks/trust_rank.py b/tasks/trust_rank.py index ec75373..6922fa6 100755 --- a/tasks/trust_rank.py +++ b/tasks/trust_rank.py @@ -218,8 +218,8 @@ def trust_rank(task_hook: TaskHook): trust = g.new_vertex_property("double") trust.a[seed_ids] = 1.0 / len(seed_ids) scores = gtc.pagerank(g, damping=damping_factor, pers=trust, weight=weights) - # Compute and return the results. task_hook.set_progress(3 / 4.0, "Formating results.") # Convert results to useful output and save it - task_hook.set_results(scores_to_results(search_target, result_size, g, seed_ids, drug_ids, scores, ppi_dataset, pdi_dataset, filter_paths)) + results = scores_to_results(search_target, result_size, g, seed_ids, drug_ids, scores, ppi_dataset, pdi_dataset, filter_paths) + task_hook.set_results(results) diff --git a/tasks/util/read_graph_tool_graph.py b/tasks/util/read_graph_tool_graph.py index 35a89c4..2db32cf 100755 --- a/tasks/util/read_graph_tool_graph.py +++ b/tasks/util/read_graph_tool_graph.py @@ -51,7 +51,6 @@ def read_graph_tool_graph(file_path, seeds, id_space, max_deg, include_indirect_ # Remove all unconnected nodes TODO probably already skip when creating .gt files if g.vertex(node).out_degree() == 0 and target == 'drug': deleted_nodes.append(node) - # if not g.vertex_properties["name"][node] in set(seeds) and g.vertex(node).out_degree() > max_deg: elif not g.vertex_properties[node_name_attribute][node] in set(seeds) and ( g.vertex(node).out_degree() > max_deg): deleted_nodes.append(node) @@ -74,8 +73,8 @@ def read_graph_tool_graph(file_path, seeds, id_space, max_deg, include_indirect_ if include_non_approved_drugs: drug_ids.append(node) else: - drug_groups = g.vertex_properties["status"][node].split(', ') - if "approved" in drug_groups: + # drug_groups = g.vertex_properties["status"][node].split(', ') + if "approved" in g.vertex_properties["status"][node]: drug_ids.append(node) # Delete edges that should be ignored or are not contained in the selected dataset. @@ -111,26 +110,26 @@ def read_graph_tool_graph(file_path, seeds, id_space, max_deg, include_indirect_ if indir_drug and int(edge.target()) in drug_ids: drug_ids.remove(int(edge.target())) - elif g.vertex_properties["type"][ - edge.source()] == d_type and edge.source() not in direct_drugs or edge.target() not in seed_ids: + elif g.vertex_properties["type"][edge.source()] == d_type and \ + edge.source() not in direct_drugs or edge.target() not in seed_ids: indir_drug = edge.source() not in direct_drugs not_seed = edge.target() not in seed_ids if indir_drug or not_seed: deleted_edges.append(edge) if indir_drug and int(edge.source()) in drug_ids: drug_ids.remove(int(edge.source())) - else: - deleted_edges.append(edge) + # else: + # deleted_edges.append(edge) g.set_fast_edge_removal(fast=True) for edge in deleted_edges: g.remove_edge(edge) g.set_fast_edge_removal(fast=False) - vertices = 0 - for _ in g.vertices(): - vertices += 1 - edges = 0 - for _ in g.edges(): - edges += 1 + # vertices = 0 + # for _ in g.vertices(): + # vertices += 1 + # edges = 0 + # for _ in g.edges(): + # edges += 1 # Return the graph and the indices of the seed_ids and the seeds. return g, list(seed_ids.keys()), drug_ids diff --git a/tasks/util/scores_to_results.py b/tasks/util/scores_to_results.py index e3b679e..4db5e69 100755 --- a/tasks/util/scores_to_results.py +++ b/tasks/util/scores_to_results.py @@ -16,8 +16,6 @@ def scores_to_results( r"""Transforms the scores to the required result format.""" node_name_attribute = "internal_id" # nodes in the input network which is created from RepoTrialDB have primaryDomainId as name attribute - candidates = [] - # if strain_or_drugs == "drugs": if target == "drug": candidates = [(node, scores[node]) for node in drug_ids if scores[node] > 0] else: -- GitLab