From efe65a77b24bbc34a8e9e15a30c36583c836f494 Mon Sep 17 00:00:00 2001
From: AndiMajore <andi.majore@googlemail.com>
Date: Tue, 26 Jul 2022 00:09:44 +0200
Subject: [PATCH] adjusted id-space wise analysis preparation and execution

Former-commit-id: fb4d5146205053e00478b0de87642b09d692bc5a [formerly 45a041e729435626ada319eab340b8d962993623]
Former-commit-id: ed383169c0bf47eb77541dfc30c69e1894350858
---
 drugstone/management/commands/make_graphs.py | 112 ++++++++++++++-----
 drugstone/management/commands/populate_db.py |  28 ++---
 drugstone/views.py                           |  83 +++++++-------
 tasks/betweenness_centrality.py              |   8 +-
 tasks/closeness_centrality.py                |   8 +-
 tasks/degree_centrality.py                   |   8 +-
 tasks/keypathwayminer_task.py                |  17 ++-
 tasks/multi_steiner.py                       |   7 +-
 tasks/network_proximity.py                   |   4 +-
 tasks/trust_rank.py                          |   2 +-
 tasks/util/find_bridges.py                   |   2 +-
 tasks/util/read_graph_tool_graph.py          |  27 +----
 tasks/util/scores_to_results.py              |  22 ++--
 tasks/util/steiner_tree.py                   |   2 +-
 14 files changed, 192 insertions(+), 138 deletions(-)

diff --git a/drugstone/management/commands/make_graphs.py b/drugstone/management/commands/make_graphs.py
index 0ff9bb3..8794f48 100755
--- a/drugstone/management/commands/make_graphs.py
+++ b/drugstone/management/commands/make_graphs.py
@@ -1,3 +1,4 @@
+from collections import defaultdict
 from typing import List, Tuple
 import graph_tool.all as gt
 from drugstone import models
@@ -77,7 +78,7 @@ def _internal_ppis(dataset) -> List[models.ProteinProteinInteraction]:
     return node_node_interaction_objects
 
 
-def create_gt(params: Tuple) -> None:
+def create_gt(params: List) -> None:
     """Fetches all required information to build a graph-tools file for given
     PPI and PDI dataset names (params). Builds the graph-tools file and saves it in 
     the data/Networks folder.
@@ -85,37 +86,31 @@ def create_gt(params: Tuple) -> None:
     Args:
         params (Tuple[str, str]): Protein-protein-dataset name, Protein-drug-dataset name
     """
-    ppi_dataset, pdi_dataset = params
+    ppi_dataset, pdi_dataset, identifier = params
+
     licensed = ppi_dataset.licenced or pdi_dataset.licenced
     # get data from api
 
     g = gt.Graph(directed=False)
+
     e_type = g.new_edge_property("string")
 
     v_type = g.new_vertex_property("string")
     v_name = g.new_vertex_property("string")
-    v_drugstone_id = g.new_vertex_property("string")
-    v_has_symbol = g.new_vertex_property("bool")
-    v_has_entrez = g.new_vertex_property("bool")
-    v_has_ensembl = g.new_vertex_property("bool")
-    v_expression = g.new_vertex_property("string")
 
     # for drugs
     v_status = g.new_vertex_property("string")
     v_drug_id = g.new_vertex_property("string")
+    v_internal_id = g.new_vertex_property("string")
 
     g.edge_properties["type"] = e_type
-    g.edge_properties["drugstone_id"] = e_type
+    # g.edge_properties["drugstone_id"] = e_type
 
     g.vertex_properties["type"] = v_type
     g.vertex_properties["name"] = v_name
-    g.vertex_properties["drugstone_id"] = v_drugstone_id
-    g.vertex_properties["has_symbol"] = v_has_symbol
-    g.vertex_properties["has_entrez"] = v_has_entrez
-    g.vertex_properties["has_ensembl"] = v_has_ensembl
     g.vertex_properties["status"] = v_status
     g.vertex_properties["drug_id"] = v_drug_id
-    g.vertex_properties["expression"] = v_expression
+    g.vertex_properties["internal_id"] = v_internal_id
 
     # store nodes to connect them when creating edges
     vertices = {}
@@ -123,21 +118,46 @@ def create_gt(params: Tuple) -> None:
     # add vertices
 
     # print("adding nodes")
-    print(f'loading nodes')
+    print(f'loading nodes for {identifier}')
     # extend node data by cancer nodes, we create a normal node for each cancer node.
     # on reading the data, we decide which one to keep based on the user selected cancer types
 
-    has_ensembl_set = {node.protein_id for node in models.EnsemblGene.objects.all()}
+    is_entrez = identifier == 'entrez'
+    is_symbol = identifier == 'symbol'
+    is_uniprot = identifier == 'uniprot'
+    is_ensg = identifier == 'ensg'
+
+    if is_ensg:
+        ensembl_set = defaultdict(set)
+        for node in models.EnsemblGene.objects.all():
+            ensembl_set[node.protein_id].add(node.name)
+
+    node_id_map = defaultdict(set)
+    drugstone_ids_to_node_ids = defaultdict(set)
 
     for node in models.Protein.objects.all():
+        if is_entrez:
+            if len(node.entrez) != 0:
+                node_id_map[node.entrez].add(node.id)
+                drugstone_ids_to_node_ids[node.id].add(node.entrez)
+        elif is_symbol:
+            if len(node.gene) != 0:
+                node_id_map[node.gene].add(node.id)
+                drugstone_ids_to_node_ids[node.id].add(node.gene)
+        elif is_uniprot:
+            node_id_map[node.uniprot_code].add(node.id)
+            drugstone_ids_to_node_ids[node.id].add(node.uniprot_code)
+        elif is_ensg:
+            for id in ensembl_set[node.id]:
+                node_id_map[id].add(node.id)
+                drugstone_ids_to_node_ids[node.id].add(id)
+
+    for id, nodes in node_id_map.items():
         v = g.add_vertex()
         v_type[v] = 'protein'
-        v_drugstone_id[v] = f"p{node.id}"
-        v_has_symbol[v] = len(node.gene) != 0
-        v_has_entrez[v] = len(node.entrez) != 0
-        v_has_ensembl[v] = node.id in has_ensembl_set
-        vertices[node.id] = v
-
+        v_internal_id[v] = id
+        for drugstone_id in nodes:
+            vertices[drugstone_id] = v
     print("done with nodes")
 
     print(f"adding drugs")
@@ -145,22 +165,42 @@ def create_gt(params: Tuple) -> None:
         v = g.add_vertex()
         v_type[v] = 'drug'
         v_status[v] = node.status
-        v_drugstone_id[v] = f'dr{node.id}'
+        v_internal_id[v] = f'dr{node.id}'
 
         drug_vertices[node.id] = v
+
     print("done with drugs")
 
     # add edges
     print(f'adding ppi_edges/{ppi_dataset}')
+
+    uniq_edges = set()
+
     for edge_raw in _internal_ppis(ppi_dataset):
-        e = g.add_edge(vertices[edge_raw.from_protein_id], vertices[edge_raw.to_protein_id])
-        e_type[e] = 'protein-protein'
+        id1 = edge_raw.from_protein_id
+        id2 = edge_raw.to_protein_id
+        if id1 > id2:
+            tmp = id1
+            id1 = id2
+            id2 = tmp
+        hash = f'{id1}_{id2}'
+        if hash not in uniq_edges and id1 in vertices and id2 in vertices:
+            uniq_edges.add(hash)
+            e = g.add_edge(vertices[id1], vertices[id2])
+            e_type[e] = 'protein-protein'
     print("done with edges")
 
+    uniq_edges = set()
+
     print(f'loading drug_edges/{pdi_dataset}')
     for edge_raw in _internal_pdis(pdi_dataset):
-        e = g.add_edge(drug_vertices[edge_raw.drug_id], vertices[edge_raw.protein_id])
-        e_type[e] = 'drug-protein'
+        id1 = edge_raw.drug_id
+        id2 = edge_raw.protein_id
+        hash = f'{id1}_{id2}'
+        if hash not in uniq_edges and id1 in drug_vertices and id2 in vertices:
+            uniq_edges.add(hash)
+            e = g.add_edge(drug_vertices[id1], vertices[id2])
+            e_type[e] = 'drug-protein'
     print("done with drug edges")
 
     # remove unconnected proteins
@@ -177,7 +217,7 @@ def create_gt(params: Tuple) -> None:
     g.remove_vertex(reversed(sorted(delete_vertices)), fast=True)
 
     # save graph
-    filename = f"./data/Networks/internal_{ppi_dataset.name}_{pdi_dataset.name}"
+    filename = f"./data/Networks/{identifier}_{ppi_dataset.name}-{pdi_dataset.name}"
     if licensed:
         filename += "_licenced"
     filename += ".gt"
@@ -195,11 +235,25 @@ class Command(BaseCommand):
 
         pdi_datasets = models.PDIDataset.objects.all()
 
+        licenced_ppi_dataset = {ppi.name: ppi for ppi in ppi_datasets if ppi.licenced}
+        licenced_pdi_dataset = {pdi.name: pdi for pdi in pdi_datasets if pdi.licenced}
+
+        uniq_combis = set()
         parameter_combinations = []
         for protein_interaction_dataset in ppi_datasets:
             for pdi_dataset in pdi_datasets:
-                parameter_combinations.append((protein_interaction_dataset, pdi_dataset))
-
+                licenced = protein_interaction_dataset.licenced or pdi_dataset.licenced
+                if licenced:
+                    protein_interaction_dataset = licenced_ppi_dataset[
+                        protein_interaction_dataset.name] if protein_interaction_dataset.name in licenced_ppi_dataset else protein_interaction_dataset
+                    pdi_dataset = licenced_pdi_dataset[
+                        pdi_dataset.name] if pdi_dataset.name in licenced_pdi_dataset else pdi_dataset
+                hash = f'{protein_interaction_dataset.name}-{pdi_dataset.name}_{licenced}'
+                if hash in uniq_combis:
+                    continue
+                uniq_combis.add(hash)
+                for identifier in ['ensg', 'symbol', 'entrez', 'uniprot']:
+                    parameter_combinations.append([protein_interaction_dataset, pdi_dataset, identifier])
         # close all database connections so subprocesses will create their own connections
         # this prevents the processes from running into problems because of using the same connection
         db.connections.close_all()
diff --git a/drugstone/management/commands/populate_db.py b/drugstone/management/commands/populate_db.py
index 429cf6f..1fd5f5f 100755
--- a/drugstone/management/commands/populate_db.py
+++ b/drugstone/management/commands/populate_db.py
@@ -215,20 +215,20 @@ def populate(kwargs):
         print(f'Populated {n} DrDi associations from DrugBank.')
 
     if kwargs['protein_protein']:
-        print('Importing PPIs from unlicenced NeDRexDB...')
-        n = NedrexImporter.import_protein_protein_interactions(importer,
-                                                               DatasetLoader.get_ppi_nedrex(nedrex_api_url_unlicenced, False),
-                                                               update)
-        total_n += n
-        print(f'Imported {n} PPIs from unlicended NeDRexDB')
-        print('Importing PPIs from licenced NeDRexDB...')
-        n = NedrexImporter.import_protein_protein_interactions(importer,
-                                                               DatasetLoader.get_ppi_nedrex(nedrex_api_url_licenced,
-                                                                                            True),
-                                                               update)
-        total_n += n
-        nedrex_update = True
-        print(f'Imported {n} PPIs from licended NeDRexDB')
+        # print('Importing PPIs from unlicenced NeDRexDB...')
+        # n = NedrexImporter.import_protein_protein_interactions(importer,
+        #                                                        DatasetLoader.get_ppi_nedrex(nedrex_api_url_unlicenced, False),
+        #                                                        update)
+        # total_n += n
+        # print(f'Imported {n} PPIs from unlicenced NeDRexDB')
+        # print('Importing PPIs from licenced NeDRexDB...')
+        # n = NedrexImporter.import_protein_protein_interactions(importer,
+        #                                                        DatasetLoader.get_ppi_nedrex(nedrex_api_url_licenced,
+        #                                                                                     True),
+        #                                                        update)
+        # total_n += n
+        # nedrex_update = True
+        # print(f'Imported {n} PPIs from licenced NeDRexDB')
         print('Populating PPIs from STRING...')
         n = DataPopulator.populate_ppi_string(populator, DatasetLoader.get_ppi_string(), update)
         total_n += n
diff --git a/drugstone/views.py b/drugstone/views.py
index d923139..d7a03fa 100755
--- a/drugstone/views.py
+++ b/drugstone/views.py
@@ -78,11 +78,9 @@ class TaskView(APIView):
         licenced = parameters.get('licenced', False)
 
         # find databases based on parameter strings
-        print(get_ppi_ds(parameters.get('ppi_dataset', DEFAULTS['ppi']), licenced))
         parameters['ppi_dataset'] = PPIDatasetSerializer().to_representation(
             get_ppi_ds(parameters.get('ppi_dataset', DEFAULTS['ppi']), licenced))
 
-        print(get_pdi_ds(parameters.get('pdi_dataset', DEFAULTS['pdi']), licenced))
         parameters['pdi_dataset'] = PDIDatasetSerializer().to_representation(
             get_pdi_ds(parameters.get('pdi_dataset', DEFAULTS['pdi']), licenced))
 
@@ -177,10 +175,6 @@ def map_nodes(request) -> Response:
     nodes_mapped, id_key = query_proteins_by_identifier(node_ids, identifier)
 
     # change data structure to dict in order to be quicker when merging
-    # if identifier == 'ensg':
-    #     # a protein might have multiple ensg-numbers, unpack these into single nodes
-    #     nodes_mapped_dict = {node_id: node for node in nodes_mapped for node_id in node[id_key]}
-    # else:
     nodes_mapped_dict = {node[id_key][0]: node for node in nodes_mapped}
 
     # merge fetched data with given data to avoid data loss
@@ -265,10 +259,14 @@ def result_view(request) -> Response:
     drugs = []
 
     network = result['network']
-    node_types = {}
-    node_attributes['node_types'] = node_types
-    is_seed = {}
-    node_attributes['is_seed'] = is_seed
+    node_types = node_attributes.get('node_types')
+    if not node_types:
+        node_types = {}
+        node_attributes['node_types'] = node_types
+    is_seed = node_attributes.get('is_seed')
+    if not is_seed:
+        is_seed = {}
+        node_attributes['is_seed'] = is_seed
     scores = node_attributes.get('scores', {})
     node_details = {}
     protein_id_map = defaultdict(set)
@@ -286,7 +284,7 @@ def result_view(request) -> Response:
     # merge input network with result network
     for node in parameters['input_network']['nodes']:
         # if node was already mapped, add user defined values to result of analysis
-        if identifier in identifier_nodes:
+        if identifier in node:
             node_name = node[identifier][0]
             if node_name in node_details:
                 # update the node to not lose user input attributes
@@ -310,12 +308,32 @@ def result_view(request) -> Response:
             result['node_attributes']['node_types'][node_id] = 'custom'
     # extend the analysis network by the input netword nodes
     # map edge endpoints to database proteins if possible and add edges to analysis network
-
+    protein_nodes = set()
     # mapping all new protein and drug nodes by drugstoneIDs + adding scores
     for node_id in nodes:
+        if node_id[:2] == 'dr':
+            node_data = DrugSerializer().to_representation(Drug.objects.get(id=int(node_id[2:])))
+            node_data['drugstoneType'] = 'drug'
+            drugs.append(node_data)
+            if node_id in scores:
+                node_data['score'] = scores.get(node_id, None)
+            node_types[node_id] = 'drug'
+            node_details[node_id] = node_data
+        elif node_id[:2] != 'di':
+            protein_nodes.add(node_id)
+        else:
+            continue
+
+    nodes_mapped, _ = query_proteins_by_identifier(protein_nodes, identifier)
+
+    nodes_mapped_dict = {node[identifier][0]: node for node in nodes_mapped}
 
-        if node_id[0] == 'p':
-            node_data = ProteinNodeSerializer().to_representation(Protein.objects.get(id=int(node_id[1:])))
+    # merge fetched data with given data to avoid data loss
+    for node_id in nodes:
+        if node_id in nodes_mapped_dict:
+            # node.update(nodes_mapped_dict[node['id']])
+            node_data = nodes_mapped_dict[node_id]
+            node_data['drugstoneType'] = 'protein'
             # proteins.append(node_data)
             node_ident = node_data[identifier][0]
             # node_data[identifier] = [node_ident]
@@ -326,36 +344,20 @@ def result_view(request) -> Response:
             score = scores.get(node_id, None)
             if node_ident in node_details:
                 data = node_details[node_ident]
-                data['entrez'].extend(node_data['entrez'])
-                data['ensg'].extend(node_data['ensg'])
-                data['symbol'].extend(node_data['symbol'])
-                data['uniprot_ac'].extend(node_data['uniprot_ac'])
-                if score:
-                    if 'score' in data:
-                        data['score'].append(score)
-                    else:
-                        data['score'] = [score] if score else []
+                data['score'] = [score] if score else None
             else:
-                node_data['score'] = [score] if score else []
+                node_data['score'] = score if score else None
                 node_data['drugstoneType'] = 'protein'
                 node_data['id'] = node_ident
                 node_data['label'] = node_ident
                 node_details[node_ident] = node_data
 
-        elif node_id[:2] == 'dr':
-            node_data = DrugSerializer().to_representation(Drug.objects.get(id=int(node_id[2:])))
-            drugs.append(node_data)
-            if node_id in scores:
-                node_data['score'] = scores.get(node_id, None)
-            node_types[node_id] = 'drug'
-            node_details[node_id] = node_data
-        else:
-            continue
     for node_id, detail in node_details.items():
-        detail['symbol'] = list(set(detail['symbol']))
-        detail['entrez'] = list(set(detail['entrez']))
-        detail['uniprot_ac'] = list(set(detail['uniprot_ac']))
-        detail['ensg'] = list(set(detail['ensg']))
+        if 'drugstoneType' in detail and detail['drugstoneType'] == 'protein':
+            detail['symbol'] = list(set(detail['symbol']))
+            detail['entrez'] = list(set(detail['entrez']))
+            detail['uniprot_ac'] = list(set(detail['uniprot_ac']))
+            detail['ensg'] = list(set(detail['ensg']))
 
     edges = parameters['input_network']['edges']
     edge_endpoint_ids = set()
@@ -389,7 +391,12 @@ def result_view(request) -> Response:
             map(lambda n: {"from": f'p{n.from_protein_id}', "to": f'p{n.to_protein_id}'}, interaction_objects))
         edges.extend(auto_edges)
     result['network']['edges'].extend(edges)
-    result['network']['nodes'] = list(identifier_nodes)
+    # uniq_edges = dict()
+    # for edge in result['network']['edges']:
+    #     hash = edge['from'] + edge['to']
+    #     uniq_edges[hash] = edge
+    # result['network']['edges']=list(uniq_edges.values())
+    # result['network']['nodes'] = list(identifier_nodes)
     if 'scores' in result['node_attributes']:
         del result['node_attributes']['scores']
 
diff --git a/tasks/betweenness_centrality.py b/tasks/betweenness_centrality.py
index f763b1e..06d4ca0 100755
--- a/tasks/betweenness_centrality.py
+++ b/tasks/betweenness_centrality.py
@@ -172,15 +172,13 @@ def betweenness_centrality(task_hook: TaskHook):
 
     id_space = task_hook.parameters["config"].get("identifier","symbol")
 
-    print(id_space)
-
     # Parsing input file.
     task_hook.set_progress(0 / 3.0, "Parsing input.")
-    filename = f"internal_{ppi_dataset['name']}_{pdi_dataset['name']}"
+    filename = f"{id_space}_{ppi_dataset['name']}-{pdi_dataset['name']}"
     if ppi_dataset['licenced'] or pdi_dataset['licenced']:
         filename += "_licenced"
-    filename = os.path.join(task_hook.data_directory, filename+".gt")
-    g, seed_ids, id_space, drug_ids = read_graph_tool_graph(
+    filename = os.path.join(task_hook.data_directory, filename + ".gt")
+    g, seed_ids, drug_ids = read_graph_tool_graph(
         filename,
         seeds,
         max_deg,
diff --git a/tasks/closeness_centrality.py b/tasks/closeness_centrality.py
index 78222aa..7ae2772 100755
--- a/tasks/closeness_centrality.py
+++ b/tasks/closeness_centrality.py
@@ -170,13 +170,13 @@ def closeness_centrality(task_hook: TaskHook):
     
     # Parsing input file.
     task_hook.set_progress(0 / 4.0, "Parsing input.")
-    filename = f"internal_{ppi_dataset['name']}_{pdi_dataset['name']}"
-    if ppi_dataset['licenced'] or pdi_dataset['licenced']:
-        filename += "_licenced"
 
     id_space = task_hook.parameters["config"].get("identifier", "symbol")
 
-    filename = os.path.join(task_hook.data_directory, filename+".gt")
+    filename = f"{id_space}_{ppi_dataset['name']}-{pdi_dataset['name']}"
+    if ppi_dataset['licenced'] or pdi_dataset['licenced']:
+        filename += "_licenced"
+    filename = os.path.join(task_hook.data_directory, filename + ".gt")
     # g, seed_ids, viral_protein_ids, drug_ids = read_graph_tool_graph(file_path, seeds, datasets, ignored_edge_types, max_deg, ignore_non_seed_baits, include_indirect_drugs, include_non_approved_drugs)
     g, seed_ids, drug_ids = read_graph_tool_graph(filename, seeds, id_space, max_deg, include_indirect_drugs, include_non_approved_drugs, search_target)
     task_hook.set_progress(1 / 4.0, "Computing edge weights.")
diff --git a/tasks/degree_centrality.py b/tasks/degree_centrality.py
index 1a709a0..e529c8a 100755
--- a/tasks/degree_centrality.py
+++ b/tasks/degree_centrality.py
@@ -150,13 +150,13 @@ def degree_centrality(task_hook: TaskHook):
     
     # Parsing input file.
     task_hook.set_progress(0 / 3.0, "Parsing input.")
-    filename = f"internal_{ppi_dataset['name']}_{pdi_dataset['name']}"
-    if ppi_dataset['licenced'] or pdi_dataset['licenced']:
-        filename += "_licenced"
 
     id_space = task_hook.parameters["config"].get("identifier", "symbol")
 
-    filename = os.path.join(task_hook.data_directory, filename+".gt")
+    filename = f"{id_space}_{ppi_dataset['name']}-{pdi_dataset['name']}"
+    if ppi_dataset['licenced'] or pdi_dataset['licenced']:
+        filename += "_licenced"
+    filename = os.path.join(task_hook.data_directory, filename + ".gt")
     # g, seed_ids, viral_protein_ids, drug_ids = read_graph_tool_graph(file_path, seeds, datasets, ignored_edge_types, max_deg, ignore_non_seed_baits, False, include_non_approved_drugs)
     g, seed_ids, drug_ids = read_graph_tool_graph(filename, seeds, id_space, max_deg, False, include_non_approved_drugs, search_target)
     
diff --git a/tasks/keypathwayminer_task.py b/tasks/keypathwayminer_task.py
index dc26914..54080de 100755
--- a/tasks/keypathwayminer_task.py
+++ b/tasks/keypathwayminer_task.py
@@ -10,7 +10,7 @@ import requests
 
 from tasks.task_hook import TaskHook
 
-from drugstone.models import Protein
+from drugstone.models import Protein, EnsemblGene
 
 # Base URL
 # url = 'http://172.25.0.1:9003/keypathwayminer/requests/'
@@ -57,9 +57,18 @@ def kpm_task(task_hook: TaskHook):
     # --- Fetch and generate the datasets
     dataset_name = 'indicatorMatrix'
     indicator_matrix_string = ''
-    protein_backend_ids = [int(seed[1:]) for seed in task_hook.seeds]
-    proteins = Protein.objects.filter(id__in=protein_backend_ids)
-
+    id_space = task_hook.parameters["config"].get("identifier", "symbol")
+    proteins = []
+    if id_space == 'symbol':
+        proteins = Protein.objects.filter(gene__in=task_hook.seeds)
+    elif id_space == 'entrez':
+        proteins = Protein.objects.filter(entrez__in=task_hook.seeds)
+    elif id_space == 'uniprot':
+        proteins = Protein.objects.filter(uniprot_code__in=task_hook.seeds)
+    elif id_space == 'ensg':
+        protein_ids = {ensg.protein_id for ensg in EnsemblGene.objects.filter(name__in=task_hook.seeds)}
+        proteins = Protein.objects.filter(id__in=protein_ids)
+    protein_backend_ids = {p.id for p in proteins}
     for protein in proteins:
         indicator_matrix_string += f'{protein.uniprot_code}\t1\n'
 
diff --git a/tasks/multi_steiner.py b/tasks/multi_steiner.py
index 6e81bca..f361168 100755
--- a/tasks/multi_steiner.py
+++ b/tasks/multi_steiner.py
@@ -97,7 +97,7 @@ def multi_steiner(task_hook: TaskHook):
 
     search_target = task_hook.parameters.get("target", "drug-target")
 
-    node_name_attribute = "drugstone_id" # nodes in the input network which is created from RepoTrialDB have primaryDomainId as name attribute
+    node_name_attribute = "internal_id" # nodes in the input network which is created from RepoTrialDB have primaryDomainId as name attribute
 
     # Set number of threads if OpenMP support is enabled.
     if gt.openmp_enabled():
@@ -108,11 +108,10 @@ def multi_steiner(task_hook: TaskHook):
 
     id_space = task_hook.parameters["config"].get("identifier", "symbol")
 
-    filename = f"internal_{ppi_dataset['name']}_{pdi_dataset['name']}"
+    filename = f"{id_space}_{ppi_dataset['name']}-{pdi_dataset['name']}"
     if ppi_dataset['licenced'] or pdi_dataset['licenced']:
         filename += "_licenced"
-
-    filename = os.path.join(task_hook.data_directory, filename+".gt")
+    filename = os.path.join(task_hook.data_directory, filename + ".gt")
     g, seed_ids, _ = read_graph_tool_graph(filename, seeds, id_space, max_deg, target=search_target)
     # seed_map = {g.vertex_properties["name"][node]: node for node in seed_ids}
     seed_map = {g.vertex_properties[node_name_attribute][node]: node for node in seed_ids}
diff --git a/tasks/network_proximity.py b/tasks/network_proximity.py
index 6755d8e..716c7a3 100755
--- a/tasks/network_proximity.py
+++ b/tasks/network_proximity.py
@@ -89,10 +89,10 @@ def network_proximity(task_hook: TaskHook):
 
     id_space = task_hook.parameters["config"].get("identifier", "symbol")
 
-    filename = f"internal_{ppi_dataset['name']}_{pdi_dataset['name']}"
+    filename = f"{id_space}_{ppi_dataset['name']}-{pdi_dataset['name']}"
     if ppi_dataset['licenced'] or pdi_dataset['licenced']:
         filename += "_licenced"
-    filename = os.path.join(task_hook.data_directory, filename+".gt")
+    filename = os.path.join(task_hook.data_directory, filename + ".gt")
     # g, seed_ids, _, drug_ids = read_graph_tool_graph(file_path, seeds, "", "", max_deg, False, True, include_non_approved_drugs)
     g, seed_ids, drug_ids = read_graph_tool_graph(filename, seeds, id_space, max_deg, True, include_non_approved_drugs, target=search_target)
     # Computing edge weights.
diff --git a/tasks/trust_rank.py b/tasks/trust_rank.py
index 4737bbe..fbcb5cc 100755
--- a/tasks/trust_rank.py
+++ b/tasks/trust_rank.py
@@ -201,7 +201,7 @@ def trust_rank(task_hook: TaskHook):
 
     id_space = task_hook.parameters["config"].get("identifier", "symbol")
 
-    filename = f"internal_{ppi_dataset['name']}_{pdi_dataset['name']}"
+    filename = f"{id_space}_{ppi_dataset['name']}-{pdi_dataset['name']}"
     if ppi_dataset['licenced'] or pdi_dataset['licenced']:
         filename += "_licenced"
     filename = os.path.join(task_hook.data_directory, filename+".gt")
diff --git a/tasks/util/find_bridges.py b/tasks/util/find_bridges.py
index 890c4f5..fb8cc98 100755
--- a/tasks/util/find_bridges.py
+++ b/tasks/util/find_bridges.py
@@ -17,7 +17,7 @@ def __dfs_find_bridges(g, node, visited, disc, low, parent, is_bridge):
             low[node] = min(low[node], low[nb])
             if low[nb] > disc[node]:
                 is_bridge[g.edge(node, nb)] = True
-        elif int(nb) != parent[node]: #TODO can in theory be removed because
+        elif int(nb) != parent[node]: #TODO can in theory be removed
             low[node] = min(low[node], disc[nb])
 
 def find_bridges(g):
diff --git a/tasks/util/read_graph_tool_graph.py b/tasks/util/read_graph_tool_graph.py
index d496f3b..92c1333 100755
--- a/tasks/util/read_graph_tool_graph.py
+++ b/tasks/util/read_graph_tool_graph.py
@@ -3,7 +3,8 @@ import graph_tool.topology as gtt
 
 
 # def read_graph_tool_graph(file_path, seeds, datasets, ignored_edge_types, max_deg, ignore_non_seed_baits=False, include_indirect_drugs=False, include_non_approved_drugs=False):
-def read_graph_tool_graph(file_path, seeds,id_space, max_deg, include_indirect_drugs=False, include_non_approved_drugs=False,
+def read_graph_tool_graph(file_path, seeds, id_space, max_deg, include_indirect_drugs=False,
+                          include_non_approved_drugs=False,
                           target='drug'):
     r"""Reads a graph-tool graph from file.
 
@@ -45,7 +46,7 @@ def read_graph_tool_graph(file_path, seeds,id_space, max_deg, include_indirect_d
 
     # drug_protein = "DrugHasTarget"
     d_type = "drug"
-    node_name_attribute = "drugstone_id"  # nodes in the input network which is created from RepoTrialDB have primaryDomainId as name attribute
+    node_name_attribute = "internal_id"  # nodes in the input network which is created from RepoTrialDB have primaryDomainId as name attribute
     # Delete all nodes that are not contained in the selected datasets and have degrees higher than max_deg
     deleted_nodes = []
     for node in range(g.num_vertices()):
@@ -59,20 +60,18 @@ def read_graph_tool_graph(file_path, seeds,id_space, max_deg, include_indirect_d
         # remove all drugs from graph if we are not looking for drugs
         elif target != 'drug' and g.vertex_properties["type"][node] == d_type:
             deleted_nodes.append(node)
-    g.remove_vertex(deleted_nodes, fast=True)
+
+    g.remove_vertex(reversed(sorted(deleted_nodes)), fast=True)
 
     # Retrieve internal IDs of seed_ids
     seeds = set(seeds)
-    print(seeds)
     seed_ids = {}
     drug_ids = []
-    # is_matched = {protein: False for protein in seeds}
     for node in range(g.num_vertices()):
         node_type = g.vertex_properties["type"][node]
         seed_id = g.vertex_properties[node_name_attribute][node]
         if seed_id in seeds:
             seed_ids[node] = seed_id
-            # is_matched[seed_id] = node
         if node_type == d_type:
             if include_non_approved_drugs:
                 drug_ids.append(node)
@@ -81,16 +80,6 @@ def read_graph_tool_graph(file_path, seeds,id_space, max_deg, include_indirect_d
                 if "approved" in drug_groups:
                     drug_ids.append(node)
 
-    # Check that all seed seeds have been matched and throw error, otherwise.
-    # print(deleted_nodes)
-    # print(seed_ids)
-    # seeds = set(seed_ids.values())
-    # for (node, seed_id) in seed_ids.items():
-    #     if is_matched[node]
-    # for protein, found in is_matched.items():
-    #     if not found:
-    #         raise ValueError("Invalid seed protein {}. No node named {} in {}.".format(protein, protein, file_path))
-
     # Delete edges that should be ignored or are not contained in the selected dataset.
     deleted_edges = []
 
@@ -138,17 +127,11 @@ def read_graph_tool_graph(file_path, seeds,id_space, max_deg, include_indirect_d
     for edge in deleted_edges:
         g.remove_edge(edge)
     g.set_fast_edge_removal(fast=False)
-    print("Drugs")
-    print(drug_ids)
-    print("Vertices")
     vertices = 0
     for _ in g.vertices():
         vertices += 1
-    print(f'\t{vertices}')
-    print("Edges")
     edges = 0
     for _ in g.edges():
         edges += 1
-    print(f'\t{edges}')
     # Return the graph and the indices of the seed_ids and the seeds.
     return g, list(seed_ids.keys()), drug_ids
diff --git a/tasks/util/scores_to_results.py b/tasks/util/scores_to_results.py
index 4f47663..e3b679e 100755
--- a/tasks/util/scores_to_results.py
+++ b/tasks/util/scores_to_results.py
@@ -15,7 +15,7 @@ def scores_to_results(
 
     r"""Transforms the scores to the required result format."""
 
-    node_name_attribute = "drugstone_id"  # nodes in the input network which is created from RepoTrialDB have primaryDomainId as name attribute
+    node_name_attribute = "internal_id"  # nodes in the input network which is created from RepoTrialDB have primaryDomainId as name attribute
     candidates = []
     # if strain_or_drugs == "drugs":
     if target == "drug":
@@ -23,8 +23,6 @@ def scores_to_results(
     else:
         candidates = [(node, scores[node]) for node in range(g.num_vertices()) if scores[node] > 0 and node not in set(seed_ids)]
     best_candidates = [item[0] for item in sorted(candidates, key=lambda item: item[1], reverse=True)[:result_size]]
-    print(f'Candidate list length: {len(best_candidates)}')
-
     # Concatenate best result candidates with seeds and compute induced subgraph.
     # since the result size filters out nodes, the result network is not complete anymore.
     # Therefore, it is necessary to find the shortest paths to the found nodes in case intermediate nodes have been removed. 
@@ -36,6 +34,7 @@ def scores_to_results(
     returned_nodes = set(seed_ids) # return seed_ids in any case
 
     # return only the path to a drug with the shortest distance
+    accepted_candidates = set()
     if filterPaths:
         for candidate in best_candidates:
             distances = gtt.shortest_distance(g, candidate, seed_ids)
@@ -53,11 +52,12 @@ def scores_to_results(
                         break
                 if drug_in_path:
                     continue
-
+                accepted_candidates.add(g.vertex_properties[node_name_attribute][int(candidate)])
                 for vertex in vertices:
                     if int(vertex) not in returned_nodes:
                         # inserting intermediate node in order to make result comprehensive
-                        intermediate_nodes.add(g.vertex_properties[node_name_attribute][int(vertex)])
+                        if vertex != candidate:
+                            intermediate_nodes.add(g.vertex_properties[node_name_attribute][int(vertex)])
                         returned_nodes.add(int(vertex))
                 for edge in edges:
                     if ((edge.source(), edge.target()) not in returned_edges) or ((edge.target(), edge.source()) not in returned_edges):
@@ -74,18 +74,21 @@ def scores_to_results(
                         break
                 if drug_in_path:
                     continue
-
+                accepted_candidates.add(g.vertex_properties[node_name_attribute][int(candidate)])
                 for vertex in vertices:
                     if int(vertex) not in returned_nodes:
                         # inserting intermediate node in order to make result comprehensive
-                        intermediate_nodes.add(g.vertex_properties[node_name_attribute][int(vertex)])
+                        if vertex != candidate:
+                            intermediate_nodes.add(g.vertex_properties[node_name_attribute][int(vertex)])
                         returned_nodes.add(int(vertex))
                 for edge in edges:
                     if ((edge.source(), edge.target()) not in returned_edges) or ((edge.target(), edge.source()) not in returned_edges):
                         returned_edges.add((edge.source(), edge.target()))
-    print(f'Returned nodes number: {len(returned_nodes)}')
+    for node in accepted_candidates:
+        if node in intermediate_nodes:
+            intermediate_nodes.remove(node)
     subgraph = {
-        "nodes": [g.vertex_properties[node_name_attribute][node] for node in returned_nodes],
+        "nodes": [g.vertex_properties[node_name_attribute][node] for node in returned_nodes],
         "edges": [{"from": g.vertex_properties[node_name_attribute][source], "to": g.vertex_properties[node_name_attribute][target]} for source, target in returned_edges],
         }
 
@@ -97,6 +100,7 @@ def scores_to_results(
     return {
         "network": subgraph,
         'intermediate_nodes': list(intermediate_nodes),
+        'target_nodes': list(accepted_candidates),
         "node_attributes":
             {
                 "node_types": node_types,
diff --git a/tasks/util/steiner_tree.py b/tasks/util/steiner_tree.py
index 27cbedb..f91f647 100755
--- a/tasks/util/steiner_tree.py
+++ b/tasks/util/steiner_tree.py
@@ -5,7 +5,7 @@ import itertools as it
 
 def steiner_tree(g, seeds, seed_map, weights, non_zero_hub_penalty):
 
-    node_name_attribute = "drugstone_id" # nodes in the input network which is created from RepoTrialDB have primaryDomainId as name attribute
+    node_name_attribute = "internal_id"  # nodes in the input network are identified by the internal_id vertex property
     mc = gt.Graph(directed=False)
     eprop_dist = mc.new_edge_property("int")
     mc.ep['dist'] = eprop_dist
-- 
GitLab