From 185bd346189b4bd6029ae26b9d52e29bce6ef4b0 Mon Sep 17 00:00:00 2001
From: AndiMajore <andi.majore@googlemail.com>
Date: Wed, 27 Jul 2022 19:18:29 +0200
Subject: [PATCH] all algorithms working for symbol space

---
 tasks/closeness_centrality.py       | 20 +++++++-----
 tasks/network_proximity.py          |  2 +-
 tasks/quick_task.py                 | 49 +++++++----------------------
 tasks/trust_rank.py                 |  4 +--
 tasks/util/read_graph_tool_graph.py | 25 +++++++--------
 tasks/util/scores_to_results.py     |  2 --
 6 files changed, 39 insertions(+), 63 deletions(-)

diff --git a/tasks/closeness_centrality.py b/tasks/closeness_centrality.py
index 7ae2772..1c306ee 100755
--- a/tasks/closeness_centrality.py
+++ b/tasks/closeness_centrality.py
@@ -1,3 +1,4 @@
+import numpy as np
 from tasks.util.read_graph_tool_graph import read_graph_tool_graph
 from tasks.util.scores_to_results import scores_to_results
 from tasks.util.edge_weights import edge_weights
@@ -117,7 +118,6 @@ def closeness_centrality(task_hook: TaskHook):
     # Reasonable default: False.
     # Has no effect unless trust_rank.py is used for ranking drugs.
     include_indirect_drugs = task_hook.parameters.get("include_indirect_drugs", False)
-    
     # Type: bool
     # Semantics: Sepcifies whether should be included in the analysis when ranking drugs.
     # Example: False.
@@ -164,7 +164,7 @@ def closeness_centrality(task_hook: TaskHook):
 
     pdi_dataset = task_hook.parameters.get("pdi_dataset")
 
-    search_target = task_hook.parameters.get("target", "drug-target")
+    search_target = task_hook.parameters.get("target", "drug")
 
     filterPaths = task_hook.parameters.get("filter_paths", True)
     
@@ -173,11 +173,12 @@ def closeness_centrality(task_hook: TaskHook):
 
     id_space = task_hook.parameters["config"].get("identifier", "symbol")
 
+    node_name_attribute = "internal_id"
+
     filename = f"{id_space}_{ppi_dataset['name']}-{pdi_dataset['name']}"
     if ppi_dataset['licenced'] or pdi_dataset['licenced']:
         filename += "_licenced"
     filename = os.path.join(task_hook.data_directory, filename + ".gt")
-    # g, seed_ids, viral_protein_ids, drug_ids = read_graph_tool_graph(file_path, seeds, datasets, ignored_edge_types, max_deg, ignore_non_seed_baits, include_indirect_drugs, include_non_approved_drugs)
     g, seed_ids, drug_ids = read_graph_tool_graph(filename, seeds, id_space, max_deg, include_indirect_drugs, include_non_approved_drugs, search_target)
     task_hook.set_progress(1 / 4.0, "Computing edge weights.")
     weights = edge_weights(g, hub_penalty) 
@@ -189,12 +190,15 @@ def closeness_centrality(task_hook: TaskHook):
     # Call graph-tool to compute TrustRank.
     task_hook.set_progress(2 / 4.0, "Computing shortest path closeness centralities.")
     all_dists = []
+    # score_nodes = drug_ids if search_target == 'drug' else seed_ids
     for node in seed_ids:
-        all_dists.append(gtt.shortest_distance(g, node, weights=weights))
-    scores = len(seeds) / sum([dists.get_array() for dists in all_dists])
-    
+        dists = gtt.shortest_distance(g, node, weights=weights).get_array()
+        dists[dists == np.inf] = 99999999999
+        all_dists.append(dists+1)
+
+    scores = len(seed_ids) / (sum([dists for dists in all_dists]))
+
     # Compute and return the results.
-    task_hook.set_progress(3 / 4.0, "Formating results.")
-    # task_hook.set_results(scores_to_results(strain_or_drugs, result_size, g, seed_ids, viral_protein_ids, drug_ids, scores))
+    task_hook.set_progress(3 / 4.0, "Formatting results.")
     task_hook.set_results(scores_to_results(search_target, result_size, g, seed_ids, drug_ids, scores, ppi_dataset, pdi_dataset, filterPaths))
 
diff --git a/tasks/network_proximity.py b/tasks/network_proximity.py
index 716c7a3..c07beee 100755
--- a/tasks/network_proximity.py
+++ b/tasks/network_proximity.py
@@ -79,7 +79,7 @@ def network_proximity(task_hook: TaskHook):
 
     filter_paths = task_hook.parameters.get("filter_paths", True)
 
-    node_name_attribute = "drugstone_id"  # nodes in the input network which is created from RepoTrialDB have primaryDomainId as name attribute
+    node_name_attribute = "internal_id"  # name of the node-identifier attribute; NOTE(review): the earlier note said RepoTrialDB networks use primaryDomainId here, which no longer matches "internal_id" - confirm
     # Set number of threads if OpenMP support is enabled.
     if gt.openmp_enabled():
         gt.openmp_set_num_threads(num_threads)
diff --git a/tasks/quick_task.py b/tasks/quick_task.py
index 21f4cbf..899674d 100755
--- a/tasks/quick_task.py
+++ b/tasks/quick_task.py
@@ -1,25 +1,14 @@
 from tasks.task_hook import TaskHook
 
-
-def infer_node_type(node):  # TODO: This needs to be improved
-    if len(node) == 6 or len(node) == 10:
-        return 'protein'
-    # if node.startswith('DB'):
-    #     return 'drug'
-    # return 'virus'
-    if node.startswith('DB'):
-        return 'drug'
-    return 'protein'
-
-
 def quick_task(task_hook: TaskHook):
-    def run_closeness(parameters):
+    def run_closeness(parameters, network):
         from .closeness_centrality import closeness_centrality
 
         def closeness_progress(progress, status):
             task_hook.set_progress(2 / 3 + 1 / 3 * progress, status)
 
         def closeness_set_result(result):
+            result["network"]["edges"].extend(network["edges"])
             task_hook.set_results(result)
 
         # Prepare intermediate hook
@@ -31,25 +20,6 @@ def quick_task(task_hook: TaskHook):
         # Run closeness centrality
         closeness_centrality(closeness_task_hook)
 
-    def run_trust_rank(parameters, seeds):
-        from .trust_rank import trust_rank
-
-        def progress(progress, status):
-            task_hook.set_progress(2 / 3 + 1 / 3 * progress, status)
-
-        def set_result(result):
-            task_hook.set_results(result)
-
-        parameters.update({
-            "seeds": seeds,
-            "result_size": 20,
-            "include_non_approved_drugs": True,
-            "include_indirect_drugs": False,
-            "target":"drug"
-        })
-
-        tr_task_hook = TaskHook(parameters, task_hook.data_directory, progress, set_result)
-        trust_rank(tr_task_hook)
 
     def run_multi_steiner(parameters):
         from .multi_steiner import multi_steiner
@@ -60,15 +30,20 @@ def quick_task(task_hook: TaskHook):
         def ms_set_result(result):
             node_attributes = result.get("node_attributes", {})
             node_types = node_attributes.get("node_types", {})
-            # seeds = [seed for seed in result["network"]["nodes"] if node_types.get(seed) == 'host' or
-            #          (not node_types.get(seed) and infer_node_type(seed) == 'host')]
-            seeds = [seed for seed in result["network"]["nodes"] if node_types.get(seed) == 'protein' or
-                     (not node_types.get(seed) and infer_node_type(seed) == 'protein')]
+            seeds = [seed for seed in result["network"]["nodes"] if node_types.get(seed) == 'protein']
+
             if len(seeds) == 0:
                 task_hook.set_results({"network": {"nodes": [], "edges": []}})
                 return
 
-            run_trust_rank(parameters, seeds)
+            parameters.update({
+                "seeds": seeds,
+                "result_size": 10,
+                "hub_penalty": 1,
+                "target": "drug",
+                "include_non_approved_drugs": True
+            })
+            run_closeness(parameters, result["network"])
 
         parameters["num_trees"] = 1
         parameters["hub_penalty"] = 1
diff --git a/tasks/trust_rank.py b/tasks/trust_rank.py
index ec75373..6922fa6 100755
--- a/tasks/trust_rank.py
+++ b/tasks/trust_rank.py
@@ -218,8 +218,8 @@ def trust_rank(task_hook: TaskHook):
     trust = g.new_vertex_property("double")
     trust.a[seed_ids] = 1.0 / len(seed_ids)
     scores = gtc.pagerank(g, damping=damping_factor, pers=trust, weight=weights)
-    
     # Compute and return the results.
     task_hook.set_progress(3 / 4.0, "Formating results.")
     # Convert results to useful output and save it
-    task_hook.set_results(scores_to_results(search_target, result_size, g, seed_ids, drug_ids, scores, ppi_dataset, pdi_dataset, filter_paths))
+    results = scores_to_results(search_target, result_size, g, seed_ids, drug_ids, scores, ppi_dataset, pdi_dataset, filter_paths)
+    task_hook.set_results(results)
diff --git a/tasks/util/read_graph_tool_graph.py b/tasks/util/read_graph_tool_graph.py
index 35a89c4..2db32cf 100755
--- a/tasks/util/read_graph_tool_graph.py
+++ b/tasks/util/read_graph_tool_graph.py
@@ -51,7 +51,6 @@ def read_graph_tool_graph(file_path, seeds, id_space, max_deg, include_indirect_
         # Remove all unconnected nodes TODO probably already skip when creating .gt files
         if g.vertex(node).out_degree() == 0 and target == 'drug':
             deleted_nodes.append(node)
-        # if not g.vertex_properties["name"][node] in set(seeds) and g.vertex(node).out_degree() > max_deg:
         elif not g.vertex_properties[node_name_attribute][node] in set(seeds) and (
                 g.vertex(node).out_degree() > max_deg):
             deleted_nodes.append(node)
@@ -74,8 +73,8 @@ def read_graph_tool_graph(file_path, seeds, id_space, max_deg, include_indirect_
             if include_non_approved_drugs:
                 drug_ids.append(node)
             else:
-                drug_groups = g.vertex_properties["status"][node].split(', ')
-                if "approved" in drug_groups:
+                # drug_groups = g.vertex_properties["status"][node].split(', ')
+                if "approved" in g.vertex_properties["status"][node]:
                     drug_ids.append(node)
 
     # Delete edges that should be ignored or are not contained in the selected dataset.
@@ -111,26 +110,26 @@ def read_graph_tool_graph(file_path, seeds, id_space, max_deg, include_indirect_
                     if indir_drug and int(edge.target()) in drug_ids:
                         drug_ids.remove(int(edge.target()))
 
-                elif g.vertex_properties["type"][
-                    edge.source()] == d_type and edge.source() not in direct_drugs or edge.target() not in seed_ids:
+                elif g.vertex_properties["type"][edge.source()] == d_type and \
+                        edge.source() not in direct_drugs or edge.target() not in seed_ids:
                     indir_drug = edge.source() not in direct_drugs
                     not_seed = edge.target() not in seed_ids
                     if indir_drug or not_seed:
                         deleted_edges.append(edge)
                     if indir_drug and int(edge.source()) in drug_ids:
                         drug_ids.remove(int(edge.source()))
-            else:
-                deleted_edges.append(edge)
+            # else:
+            #     deleted_edges.append(edge)
 
     g.set_fast_edge_removal(fast=True)
     for edge in deleted_edges:
         g.remove_edge(edge)
     g.set_fast_edge_removal(fast=False)
-    vertices = 0
-    for _ in g.vertices():
-        vertices += 1
-    edges = 0
-    for _ in g.edges():
-        edges += 1
+    # vertices = 0
+    # for _ in g.vertices():
+    #     vertices += 1
+    # edges = 0
+    # for _ in g.edges():
+    #     edges += 1
     # Return the graph and the indices of the seed_ids and the seeds.
     return g, list(seed_ids.keys()), drug_ids
diff --git a/tasks/util/scores_to_results.py b/tasks/util/scores_to_results.py
index e3b679e..4db5e69 100755
--- a/tasks/util/scores_to_results.py
+++ b/tasks/util/scores_to_results.py
@@ -16,8 +16,6 @@ def scores_to_results(
     r"""Transforms the scores to the required result format."""
 
     node_name_attribute = "internal_id"  # nodes in the input network which is created from RepoTrialDB have primaryDomainId as name attribute
-    candidates = []
-    # if strain_or_drugs == "drugs":
     if target == "drug":
         candidates = [(node, scores[node]) for node in drug_ids if scores[node] > 0]
     else:
-- 
GitLab