diff --git a/drugstone/management/commands/make_graphs.py b/drugstone/management/commands/make_graphs.py
index 0ff9bb3c4f8e3fd6fc7d054e1aa5a1d02866552c..8794f481576041b4aa8423334f3b898b1353387d 100755
--- a/drugstone/management/commands/make_graphs.py
+++ b/drugstone/management/commands/make_graphs.py
@@ -1,3 +1,4 @@
+from collections import defaultdict
 from typing import List, Tuple
 import graph_tool.all as gt
 from drugstone import models
@@ -77,7 +78,7 @@ def _internal_ppis(dataset) -> List[models.ProteinProteinInteraction]:
     return node_node_interaction_objects
 
 
-def create_gt(params: Tuple) -> None:
+def create_gt(params: List[str]) -> None:
     """Fetches all required information to build a graph-tools file for given
     PPI and PDI dataset names (params). Builds the graph-tools file and saves it in
     the data/Networks folder.
@@ -85,37 +86,31 @@ def create_gt(params: Tuple) -> None:
     Args:
         params (Tuple[str, str]): Protein-protein-dataset name, Protein-drug-dataset name
     """
-    ppi_dataset, pdi_dataset = params
+    ppi_dataset, pdi_dataset, identifier = params
+    licensed = ppi_dataset.licenced or pdi_dataset.licenced
     # get data from api
 
     g = gt.Graph(directed=False)
+
     e_type = g.new_edge_property("string")
 
     v_type = g.new_vertex_property("string")
     v_name = g.new_vertex_property("string")
-    v_drugstone_id = g.new_vertex_property("string")
-    v_has_symbol = g.new_vertex_property("bool")
-    v_has_entrez = g.new_vertex_property("bool")
-    v_has_ensembl = g.new_vertex_property("bool")
-    v_expression = g.new_vertex_property("string")
 
     # for drugs
     v_status = g.new_vertex_property("string")
     v_drug_id = g.new_vertex_property("string")
+    v_internal_id = g.new_vertex_property("string")
 
     g.edge_properties["type"] = e_type
-    g.edge_properties["drugstone_id"] = e_type
+    # g.edge_properties["drugstone_id"] = e_type
 
     g.vertex_properties["type"] = v_type
     g.vertex_properties["name"] = v_name
-    g.vertex_properties["drugstone_id"] = v_drugstone_id
-    g.vertex_properties["has_symbol"] = v_has_symbol
-    g.vertex_properties["has_entrez"] = v_has_entrez
-    g.vertex_properties["has_ensembl"] = v_has_ensembl
     g.vertex_properties["status"] = v_status
     g.vertex_properties["drug_id"] = v_drug_id
-    g.vertex_properties["expression"] = v_expression
+    g.vertex_properties["internal_id"] = v_internal_id
 
     # store nodes to connect them when creating edges
     vertices = {}
@@ -123,21 +118,46 @@ def create_gt(params: Tuple) -> None:
     # add vertices
 
     # print("adding nodes")
-    print(f'loading nodes')
+    print(f'loading nodes for {identifier}')
 
     # extend node data by cancer nodes, we create a normal node for each cancer node.
     # on reading the data, we decide which one to keep based on the user selected cancer types
-    has_ensembl_set = {node.protein_id for node in models.EnsemblGene.objects.all()}
+    is_entrez = identifier == 'entrez'
+    is_symbol = identifier == 'symbol'
+    is_uniprot = identifier == 'uniprot'
+    is_ensg = identifier == 'ensg'
+
+    if is_ensg:
+        ensembl_set = defaultdict(set)
+        for node in models.EnsemblGene.objects.all():
+            ensembl_set[node.protein_id].add(node.name)
+
+    node_id_map = defaultdict(set)
+    drugstone_ids_to_node_ids = defaultdict(set)
     for node in models.Protein.objects.all():
+        if is_entrez:
+            if len(node.entrez) != 0:
+                node_id_map[node.entrez].add(node.id)
+                drugstone_ids_to_node_ids[node.id].add(node.entrez)
+        elif is_symbol:
+            if len(node.gene) != 0:
+                node_id_map[node.gene].add(node.id)
+                drugstone_ids_to_node_ids[node.id].add(node.gene)
+        elif is_uniprot:
+            node_id_map[node.uniprot_code].add(node.id)
+            drugstone_ids_to_node_ids[node.id].add(node.uniprot_code)
+        elif is_ensg:
+            for id in ensembl_set[node.id]:
+                node_id_map[id].add(node.id)
+                drugstone_ids_to_node_ids[node.id].add(id)
+
+    for id, nodes in node_id_map.items():
         v = g.add_vertex()
         v_type[v] = 'protein'
-        v_drugstone_id[v] = f"p{node.id}"
-        v_has_symbol[v] = len(node.gene) != 0
-        v_has_entrez[v] = len(node.entrez) != 0
-        v_has_ensembl[v] = node.id in has_ensembl_set
-        vertices[node.id] = v
-
+        v_internal_id[v] = id
+        for drugstone_id in nodes:
+            vertices[drugstone_id] = v
     print("done with nodes")
 
     print(f"adding drugs")
@@ -145,22 +165,42 @@
         v = g.add_vertex()
         v_type[v] = 'drug'
         v_status[v] = node.status
-        v_drugstone_id[v] = f'dr{node.id}'
+        v_internal_id[v] = f'dr{node.id}'
         drug_vertices[node.id] = v
+    print("done with drugs")
 
     # add edges
     print(f'adding ppi_edges/{ppi_dataset}')
+
+    uniq_edges = set()
+
     for edge_raw in _internal_ppis(ppi_dataset):
-        e = g.add_edge(vertices[edge_raw.from_protein_id], vertices[edge_raw.to_protein_id])
-        e_type[e] = 'protein-protein'
+        id1 = edge_raw.from_protein_id
+        id2 = edge_raw.to_protein_id
+        if id1 > id2:
+            tmp = id1
+            id1 = id2
+            id2 = tmp
+        hash = f'{id1}_{id2}'
+        if hash not in uniq_edges and id1 in vertices and id2 in vertices:
+            uniq_edges.add(hash)
+            e = g.add_edge(vertices[id1], vertices[id2])
+            e_type[e] = 'protein-protein'
     print("done with edges")
 
+    uniq_edges = set()
+
     print(f'loading drug_edges/{pdi_dataset}')
     for edge_raw in _internal_pdis(pdi_dataset):
-        e = g.add_edge(drug_vertices[edge_raw.drug_id], vertices[edge_raw.protein_id])
-        e_type[e] = 'drug-protein'
+        id1 = edge_raw.drug_id
+        id2 = edge_raw.protein_id
+        hash = f'{id1}_{id2}'
+        if hash not in uniq_edges and id1 in drug_vertices and id2 in vertices:
+            uniq_edges.add(hash)
+            e = g.add_edge(drug_vertices[id1], vertices[id2])
+            e_type[e] = 'drug-protein'
     print("done with drug edges")
 
     # remove unconnected proteins
@@ -177,7 +217,7 @@ def create_gt(params: Tuple) -> None:
     g.remove_vertex(reversed(sorted(delete_vertices)), fast=True)
 
     # save graph
-    filename = f"./data/Networks/internal_{ppi_dataset.name}_{pdi_dataset.name}"
+    filename = f"./data/Networks/{identifier}_{ppi_dataset.name}-{pdi_dataset.name}"
     if licensed:
         filename += "_licenced"
     filename += ".gt"
@@ -195,11 +235,25 @@ class Command(BaseCommand):
         pdi_datasets = models.PDIDataset.objects.all()
 
+        licenced_ppi_dataset = {ppi.name: ppi for ppi in ppi_datasets if ppi.licenced}
+        licenced_pdi_dataset = {pdi.name: pdi for pdi in pdi_datasets if pdi.licenced}
+
+        uniq_combis = set()
         parameter_combinations = []
         for protein_interaction_dataset in ppi_datasets:
             for pdi_dataset in pdi_datasets:
-                parameter_combinations.append((protein_interaction_dataset, pdi_dataset))
-
+                licenced = protein_interaction_dataset.licenced or pdi_dataset.licenced
+                if licenced:
+                    protein_interaction_dataset = licenced_ppi_dataset[
+                        protein_interaction_dataset.name] if protein_interaction_dataset.name in licenced_ppi_dataset else protein_interaction_dataset
+                    pdi_dataset = licenced_pdi_dataset[
+                        pdi_dataset.name] if pdi_dataset.name in licenced_pdi_dataset else pdi_dataset
+                hash = f'{protein_interaction_dataset.name}-{pdi_dataset.name}_{licenced}'
+                if hash in uniq_combis:
+                    continue
+                uniq_combis.add(hash)
+                for identifier in ['ensg', 'symbol', 'entrez', 'uniprot']:
+                    parameter_combinations.append([protein_interaction_dataset, pdi_dataset, identifier])
 
         # close all database connections so subprocesses will create their own connections
         # this prevents the processes from running into problems because of using the same connection
         db.connections.close_all()
diff --git a/drugstone/management/commands/populate_db.py b/drugstone/management/commands/populate_db.py
index 429cf6fc97b2fbc0ba07ec9b1051188b227ae9c0..1fd5f5faf04ec08285b2c2050b08dd05211daf1f 100755
--- a/drugstone/management/commands/populate_db.py
+++ b/drugstone/management/commands/populate_db.py
@@ -215,20 +215,20 @@ def populate(kwargs):
         print(f'Populated {n} DrDi associations from DrugBank.')
 
     if kwargs['protein_protein']:
-        print('Importing PPIs from unlicenced NeDRexDB...')
-        n = NedrexImporter.import_protein_protein_interactions(importer,
-                                                               DatasetLoader.get_ppi_nedrex(nedrex_api_url_unlicenced, False),
-                                                               update)
-        total_n += n
-        print(f'Imported {n} PPIs from unlicended NeDRexDB')
-        print('Importing PPIs from licenced NeDRexDB...')
-        n = NedrexImporter.import_protein_protein_interactions(importer,
-                                                               DatasetLoader.get_ppi_nedrex(nedrex_api_url_licenced,
-                                                                                            True),
-                                                               update)
-        total_n += n
-        nedrex_update = True
-        print(f'Imported {n} PPIs from licended NeDRexDB')
+        # print('Importing PPIs from unlicenced NeDRexDB...')
+        # n = NedrexImporter.import_protein_protein_interactions(importer,
+        #                                                        DatasetLoader.get_ppi_nedrex(nedrex_api_url_unlicenced, False),
+        #                                                        update)
+        # total_n += n
+        # print(f'Imported {n} PPIs from unlicended NeDRexDB')
+        # print('Importing PPIs from licenced NeDRexDB...')
+        # n = NedrexImporter.import_protein_protein_interactions(importer,
+        #                                                        DatasetLoader.get_ppi_nedrex(nedrex_api_url_licenced,
+        #                                                                                     True),
+        #                                                        update)
+        # total_n += n
+        # nedrex_update = True
+        # print(f'Imported {n} PPIs from licended NeDRexDB')
         print('Populating PPIs from STRING...')
         n = DataPopulator.populate_ppi_string(populator, DatasetLoader.get_ppi_string(), update)
         total_n += n
diff --git a/drugstone/views.py b/drugstone/views.py
index d923139ac3edad8d55c0dd6410fba7df4fbc2ddc..d7a03fa35e973f204dc763030c4197126478c996 100755
--- a/drugstone/views.py
+++ b/drugstone/views.py
@@ -78,11 +78,9 @@ class TaskView(APIView):
         licenced = parameters.get('licenced', False)
 
         # find databases based on parameter strings
-        print(get_ppi_ds(parameters.get('ppi_dataset', DEFAULTS['ppi']), licenced))
         parameters['ppi_dataset'] = PPIDatasetSerializer().to_representation(
             get_ppi_ds(parameters.get('ppi_dataset', DEFAULTS['ppi']), licenced))
 
-        print(get_pdi_ds(parameters.get('pdi_dataset', DEFAULTS['pdi']), licenced))
         parameters['pdi_dataset'] = PDIDatasetSerializer().to_representation(
             get_pdi_ds(parameters.get('pdi_dataset', DEFAULTS['pdi']), licenced))
 
@@ -177,10 +175,6 @@ def map_nodes(request) -> Response:
     nodes_mapped, id_key = query_proteins_by_identifier(node_ids, identifier)
 
     # change data structure to dict in order to be quicker when merging
-    # if identifier == 'ensg':
-    #     # a protein might have multiple ensg-numbers, unpack these into single nodes
-    #     nodes_mapped_dict = {node_id: node for node in nodes_mapped for node_id in node[id_key]}
-    # else:
     nodes_mapped_dict = {node[id_key][0]: node for node in nodes_mapped}
 
     # merge fetched data with given data to avoid data loss
@@ -265,10 +259,14 @@ def result_view(request) -> Response:
     drugs = []
 
     network = result['network']
-    node_types = {}
-    node_attributes['node_types'] = node_types
-    is_seed = {}
-    node_attributes['is_seed'] = is_seed
+    node_types = node_attributes.get('node_types')
+    if not node_types:
+        node_types = {}
+        node_attributes['node_types'] = node_types
+    is_seed = node_attributes.get('is_seed')
+    if not is_seed:
+        is_seed = {}
+        node_attributes['is_seed'] = is_seed
     scores = node_attributes.get('scores', {})
     node_details = {}
     protein_id_map = defaultdict(set)
@@ -286,7 +284,7 @@ def result_view(request) -> Response:
     # merge input network with result network
     for node in parameters['input_network']['nodes']:
         # if node was already mapped, add user defined values to result of analysis
-        if identifier in identifier_nodes:
+        if identifier in node:
             node_name = node[identifier][0]
             if node_name in node_details:
                 # update the node to not lose user input attributes
@@ -310,12 +308,32 @@ def result_view(request) -> Response:
             result['node_attributes']['node_types'][node_id] = 'custom'
 
     # extend the analysis network by the input netword nodes
     # map edge endpoints to database proteins if possible and add edges to analysis network
-
+    protein_nodes = set()
     # mapping all new protein and drug nodes by drugstoneIDs + adding scores
     for node_id in nodes:
+        if node_id[:2] == 'dr':
+            node_data = DrugSerializer().to_representation(Drug.objects.get(id=int(node_id[2:])))
+            node_data['drugstoneType'] = 'drug'
+            drugs.append(node_data)
+            if node_id in scores:
+                node_data['score'] = scores.get(node_id, None)
+            node_types[node_id] = 'drug'
+            node_details[node_id] = node_data
+        elif node_id[:2] != 'di':
+            protein_nodes.add(node_id)
+        else:
+            continue
+
+    nodes_mapped, _ = query_proteins_by_identifier(protein_nodes, identifier)
+
+    nodes_mapped_dict = {node[identifier][0]: node for node in nodes_mapped}
-        if node_id[0] == 'p':
-            node_data = ProteinNodeSerializer().to_representation(Protein.objects.get(id=int(node_id[1:])))
+    # merge fetched data with given data to avoid data loss
+    for node_id in nodes:
+        if node_id in nodes_mapped_dict:
+            # node.update(nodes_mapped_dict[node['id']])
+            node_data = nodes_mapped_dict[node_id]
+            node_data['drugstoneType'] = 'protein'
             # proteins.append(node_data)
             node_ident = node_data[identifier][0]
             # node_data[identifier] = [node_ident]
@@ -326,36 +344,20 @@
             score = scores.get(node_id, None)
             if node_ident in node_details:
                 data = node_details[node_ident]
-                data['entrez'].extend(node_data['entrez'])
-                data['ensg'].extend(node_data['ensg'])
-                data['symbol'].extend(node_data['symbol'])
-                data['uniprot_ac'].extend(node_data['uniprot_ac'])
-                if score:
-                    if 'score' in data:
-                        data['score'].append(score)
-                    else:
-                        data['score'] = [score] if score else []
+                data['score'] = [score] if score else None
             else:
-                node_data['score'] = [score] if score else []
+                node_data['score'] = score if score else None
                 node_data['drugstoneType'] = 'protein'
                 node_data['id'] = node_ident
                 node_data['label'] = node_ident
                 node_details[node_ident] = node_data
-        elif node_id[:2] == 'dr':
-            node_data = DrugSerializer().to_representation(Drug.objects.get(id=int(node_id[2:])))
-            drugs.append(node_data)
-            if node_id in scores:
-                node_data['score'] = scores.get(node_id, None)
-            node_types[node_id] = 'drug'
-            node_details[node_id] = node_data
-        else:
-            continue
 
     for node_id, detail in node_details.items():
-        detail['symbol'] = list(set(detail['symbol']))
-        detail['entrez'] = list(set(detail['entrez']))
-        detail['uniprot_ac'] = list(set(detail['uniprot_ac']))
-        detail['ensg'] = list(set(detail['ensg']))
+        if 'drugstoneType' in detail and detail['drugstoneType'] == 'protein':
+            detail['symbol'] = list(set(detail['symbol']))
+            detail['entrez'] = list(set(detail['entrez']))
+            detail['uniprot_ac'] = list(set(detail['uniprot_ac']))
+            detail['ensg'] = list(set(detail['ensg']))
 
     edges = parameters['input_network']['edges']
     edge_endpoint_ids = set()
@@ -389,7 +391,12 @@ def result_view(request) -> Response:
         map(lambda n: {"from": f'p{n.from_protein_id}', "to": f'p{n.to_protein_id}'}, interaction_objects))
     edges.extend(auto_edges)
     result['network']['edges'].extend(edges)
-    result['network']['nodes'] = list(identifier_nodes)
+    # uniq_edges = dict()
+    # for edge in result['network']['edges']:
+    #     hash = edge['from'] + edge['to']
+    #     uniq_edges[hash] = edge
+    # result['network']['edges']=list(uniq_edges.values())
+    # result['network']['nodes'] = list(identifier_nodes)
 
     if 'scores' in result['node_attributes']:
         del result['node_attributes']['scores']
diff --git a/tasks/betweenness_centrality.py b/tasks/betweenness_centrality.py
index f763b1ee5341150c26b7ddbfa633fc2878b3b227..06d4ca0b6ba384e4db02aff9ca5f4a192b3e7879 100755
--- a/tasks/betweenness_centrality.py
+++ b/tasks/betweenness_centrality.py
@@ -172,15 +172,13 @@
     id_space = task_hook.parameters["config"].get("identifier","symbol")
 
-    print(id_space)
-
     # Parsing input file.
     task_hook.set_progress(0 / 3.0, "Parsing input.")
-    filename = f"internal_{ppi_dataset['name']}_{pdi_dataset['name']}"
+    filename = f"{id_space}_{ppi_dataset['name']}-{pdi_dataset['name']}"
     if ppi_dataset['licenced'] or pdi_dataset['licenced']:
         filename += "_licenced"
-    filename = os.path.join(task_hook.data_directory, filename+".gt")
-    g, seed_ids, id_space, drug_ids = read_graph_tool_graph(
+    filename = os.path.join(task_hook.data_directory, filename + ".gt")
+    g, seed_ids, drug_ids = read_graph_tool_graph(
         filename,
         seeds,
         max_deg,
diff --git a/tasks/closeness_centrality.py b/tasks/closeness_centrality.py
index 78222aaf39f2c205cf5406e656445bdadcc9276f..7ae2772e2f5429a158d78a7bace6a475cc4b6337 100755
--- a/tasks/closeness_centrality.py
+++ b/tasks/closeness_centrality.py
@@ -170,13 +170,13 @@
     # Parsing input file.
     task_hook.set_progress(0 / 4.0, "Parsing input.")
-    filename = f"internal_{ppi_dataset['name']}_{pdi_dataset['name']}"
-    if ppi_dataset['licenced'] or pdi_dataset['licenced']:
-        filename += "_licenced"
     id_space = task_hook.parameters["config"].get("identifier", "symbol")
-    filename = os.path.join(task_hook.data_directory, filename+".gt")
+    filename = f"{id_space}_{ppi_dataset['name']}-{pdi_dataset['name']}"
+    if ppi_dataset['licenced'] or pdi_dataset['licenced']:
+        filename += "_licenced"
+    filename = os.path.join(task_hook.data_directory, filename + ".gt")
     # g, seed_ids, viral_protein_ids, drug_ids = read_graph_tool_graph(file_path, seeds, datasets, ignored_edge_types, max_deg, ignore_non_seed_baits, include_indirect_drugs, include_non_approved_drugs)
     g, seed_ids, drug_ids = read_graph_tool_graph(filename, seeds, id_space, max_deg, include_indirect_drugs, include_non_approved_drugs, search_target)
     task_hook.set_progress(1 / 4.0, "Computing edge weights.")
diff --git a/tasks/degree_centrality.py b/tasks/degree_centrality.py
index 1a709a0fbd1f15432e58eda0accd883855d7abec..e529c8a6a73b7c2882f4d5142ff3be864fd014ff 100755
--- a/tasks/degree_centrality.py
+++ b/tasks/degree_centrality.py
@@ -150,13 +150,13 @@
     # Parsing input file.
     task_hook.set_progress(0 / 3.0, "Parsing input.")
-    filename = f"internal_{ppi_dataset['name']}_{pdi_dataset['name']}"
-    if ppi_dataset['licenced'] or pdi_dataset['licenced']:
-        filename += "_licenced"
     id_space = task_hook.parameters["config"].get("identifier", "symbol")
-    filename = os.path.join(task_hook.data_directory, filename+".gt")
+    filename = f"{id_space}_{ppi_dataset['name']}-{pdi_dataset['name']}"
+    if ppi_dataset['licenced'] or pdi_dataset['licenced']:
+        filename += "_licenced"
+    filename = os.path.join(task_hook.data_directory, filename + ".gt")
     # g, seed_ids, viral_protein_ids, drug_ids = read_graph_tool_graph(file_path, seeds, datasets, ignored_edge_types, max_deg, ignore_non_seed_baits, False, include_non_approved_drugs)
     g, seed_ids, drug_ids = read_graph_tool_graph(filename, seeds, id_space, max_deg, False, include_non_approved_drugs, search_target)
diff --git a/tasks/keypathwayminer_task.py b/tasks/keypathwayminer_task.py
index dc2691401d82989ba5bcb345d7d163b505afa1d0..54080deed152ff2abcd99b1a31e75cc1e4dc046c 100755
--- a/tasks/keypathwayminer_task.py
+++ b/tasks/keypathwayminer_task.py
@@ -10,7 +10,7 @@ import requests
 
 from tasks.task_hook import TaskHook
 
-from drugstone.models import Protein
+from drugstone.models import Protein, EnsemblGene
 
 # Base URL
 # url = 'http://172.25.0.1:9003/keypathwayminer/requests/'
@@ -57,9 +57,18 @@ def kpm_task(task_hook: TaskHook):
     # --- Fetch and generate the datasets
     dataset_name = 'indicatorMatrix'
     indicator_matrix_string = ''
-    protein_backend_ids = [int(seed[1:]) for seed in task_hook.seeds]
-    proteins = Protein.objects.filter(id__in=protein_backend_ids)
-
+    id_space = task_hook.parameters["config"].get("identifier", "symbol")
+    proteins = []
+    if id_space == 'symbol':
+        proteins = Protein.objects.filter(gene__in=task_hook.seeds)
+    elif id_space == 'entrez':
+        proteins = Protein.objects.filter(entrez__in=task_hook.seeds)
+    elif id_space == 'uniprot':
+        proteins = Protein.objects.filter(uniprot_code__in=task_hook.seeds)
+    elif id_space == 'ensg':
+        protein_ids = {ensg.protein_id for ensg in EnsemblGene.objects.filter(name__in=task_hook.seeds)}
+        proteins = Protein.objects.filter(id__in=protein_ids)
+    protein_backend_ids = {p.id for p in proteins}
     for protein in proteins:
         indicator_matrix_string += f'{protein.uniprot_code}\t1\n'
diff --git a/tasks/multi_steiner.py b/tasks/multi_steiner.py
index 6e81bcabe547d67e3d8dbbd1ff4c373ba4fec0bf..f3611687d3d44719e7cb1917f6de35116a2f25b4 100755
--- a/tasks/multi_steiner.py
+++ b/tasks/multi_steiner.py
@@ -97,7 +97,7 @@ def multi_steiner(task_hook: TaskHook):
     search_target = task_hook.parameters.get("target", "drug-target")
 
-    node_name_attribute = "drugstone_id"  # nodes in the input network which is created from RepoTrialDB have primaryDomainId as name attribute
+    node_name_attribute = "internal_id"  # nodes in the input network which is created from RepoTrialDB have primaryDomainId as name attribute
 
     # Set number of threads if OpenMP support is enabled.
     if gt.openmp_enabled():
@@ -108,11 +108,10 @@ def multi_steiner(task_hook: TaskHook):
     id_space = task_hook.parameters["config"].get("identifier", "symbol")
 
-    filename = f"internal_{ppi_dataset['name']}_{pdi_dataset['name']}"
+    filename = f"{id_space}_{ppi_dataset['name']}-{pdi_dataset['name']}"
     if ppi_dataset['licenced'] or pdi_dataset['licenced']:
         filename += "_licenced"
-
-    filename = os.path.join(task_hook.data_directory, filename+".gt")
+    filename = os.path.join(task_hook.data_directory, filename + ".gt")
     g, seed_ids, _ = read_graph_tool_graph(filename, seeds, id_space, max_deg, target=search_target)
     # seed_map = {g.vertex_properties["name"][node]: node for node in seed_ids}
     seed_map = {g.vertex_properties[node_name_attribute][node]: node for node in seed_ids}
diff --git a/tasks/network_proximity.py b/tasks/network_proximity.py
index 6755d8ec5195eb7b8381a3e85fc9d2105480a016..716c7a324313c746acd35f8d256c168488bdafda 100755
--- a/tasks/network_proximity.py
+++ b/tasks/network_proximity.py
@@ -89,10 +89,10 @@ def network_proximity(task_hook: TaskHook):
     id_space = task_hook.parameters["config"].get("identifier", "symbol")
 
-    filename = f"internal_{ppi_dataset['name']}_{pdi_dataset['name']}"
+    filename = f"{id_space}_{ppi_dataset['name']}-{pdi_dataset['name']}"
     if ppi_dataset['licenced'] or pdi_dataset['licenced']:
         filename += "_licenced"
-    filename = os.path.join(task_hook.data_directory, filename+".gt")
+    filename = os.path.join(task_hook.data_directory, filename + ".gt")
     # g, seed_ids, _, drug_ids = read_graph_tool_graph(file_path, seeds, "", "", max_deg, False, True, include_non_approved_drugs)
     g, seed_ids, drug_ids = read_graph_tool_graph(filename, seeds, id_space, max_deg, True, include_non_approved_drugs, target=search_target)
     # Computing edge weights.
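
Every task module in this patch (betweenness, closeness, and degree centrality above; multi-steiner, network proximity, and trust rank) repeats the same three lines to build the .gt path. A shared helper along the following lines could replace the copies. This is a sketch only, not part of the patch; it assumes the dataset dicts carry 'name' and 'licenced' keys, as they do in the task-hook parameters used above.

import os

def network_filename(data_directory, id_space, ppi_dataset, pdi_dataset):
    # Mirrors the naming scheme used by the task scripts in this patch:
    # {id_space}_{ppi name}-{pdi name}[_licenced].gt
    filename = f"{id_space}_{ppi_dataset['name']}-{pdi_dataset['name']}"
    if ppi_dataset['licenced'] or pdi_dataset['licenced']:
        filename += "_licenced"
    return os.path.join(data_directory, filename + ".gt")
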
diff --git a/tasks/trust_rank.py b/tasks/trust_rank.py
index 4737bbe13fe49bd20c389dfac1c6ae6f5f617745..fbcb5ccfd389f91234e1d48ae1c0c4083f6dd7e7 100755
--- a/tasks/trust_rank.py
+++ b/tasks/trust_rank.py
@@ -201,7 +201,7 @@ def trust_rank(task_hook: TaskHook):
     id_space = task_hook.parameters["config"].get("identifier", "symbol")
 
-    filename = f"internal_{ppi_dataset['name']}_{pdi_dataset['name']}"
+    filename = f"{id_space}_{ppi_dataset['name']}-{pdi_dataset['name']}"
     if ppi_dataset['licenced'] or pdi_dataset['licenced']:
         filename += "_licenced"
     filename = os.path.join(task_hook.data_directory, filename+".gt")
diff --git a/tasks/util/find_bridges.py b/tasks/util/find_bridges.py
index 890c4f5b4475b98b22eee83e2c294fc24e141b3c..fb8cc9863845af214b24e77bbbbff59d779f88c4 100755
--- a/tasks/util/find_bridges.py
+++ b/tasks/util/find_bridges.py
@@ -17,7 +17,7 @@ def __dfs_find_bridges(g, node, visited, disc, low, parent, is_bridge):
             low[node] = min(low[node], low[nb])
             if low[nb] > disc[node]:
                 is_bridge[g.edge(node, nb)] = True
-        elif int(nb) != parent[node]:  #TODO can in theory be removed because
+        elif int(nb) != parent[node]:  #TODO can in theory be removed
             low[node] = min(low[node], disc[nb])
 
 def find_bridges(g):
diff --git a/tasks/util/read_graph_tool_graph.py b/tasks/util/read_graph_tool_graph.py
index d496f3b87324038416ba56d2032f06efa9852eb2..92c1333c383fef4547c2b9846b54a04bbbbb62b6 100755
--- a/tasks/util/read_graph_tool_graph.py
+++ b/tasks/util/read_graph_tool_graph.py
@@ -3,7 +3,8 @@ import graph_tool.topology as gtt
 
 
 # def read_graph_tool_graph(file_path, seeds, datasets, ignored_edge_types, max_deg, ignore_non_seed_baits=False, include_indirect_drugs=False, include_non_approved_drugs=False):
-def read_graph_tool_graph(file_path, seeds,id_space, max_deg, include_indirect_drugs=False, include_non_approved_drugs=False,
+def read_graph_tool_graph(file_path, seeds, id_space, max_deg, include_indirect_drugs=False,
+                          include_non_approved_drugs=False,
                           target='drug'):
     r"""Reads a graph-tool graph from file.
@@ -45,7 +46,7 @@ def read_graph_tool_graph(file_path, seeds,id_space, max_deg, include_indirect_d
     # drug_protein = "DrugHasTarget"
     d_type = "drug"
-    node_name_attribute = "drugstone_id"  # nodes in the input network which is created from RepoTrialDB have primaryDomainId as name attribute
+    node_name_attribute = "internal_id"  # nodes in the input network which is created from RepoTrialDB have primaryDomainId as name attribute
 
     # Delete all nodes that are not contained in the selected datasets and have degrees higher than max_deg
     deleted_nodes = []
     for node in range(g.num_vertices()):
@@ -59,20 +60,18 @@
         # remove all drugs from graph if we are not looking for drugs
         elif target != 'drug' and g.vertex_properties["type"][node] == d_type:
             deleted_nodes.append(node)
-    g.remove_vertex(deleted_nodes, fast=True)
+
+    g.remove_vertex(reversed(sorted(deleted_nodes)), fast=True)
 
     # Retrieve internal IDs of seed_ids
     seeds = set(seeds)
-    print(seeds)
     seed_ids = {}
     drug_ids = []
-    # is_matched = {protein: False for protein in seeds}
     for node in range(g.num_vertices()):
         node_type = g.vertex_properties["type"][node]
         seed_id = g.vertex_properties[node_name_attribute][node]
         if seed_id in seeds:
             seed_ids[node] = seed_id
-            # is_matched[seed_id] = node
         if node_type == d_type:
             if include_non_approved_drugs:
                 drug_ids.append(node)
@@ -81,16 +80,6 @@
                 if "approved" in drug_groups:
                     drug_ids.append(node)
 
-    # Check that all seed seeds have been matched and throw error, otherwise.
-    # print(deleted_nodes)
-    # print(seed_ids)
-    # seeds = set(seed_ids.values())
-    # for (node, seed_id) in seed_ids.items():
-    #     if is_matched[node]
-    # for protein, found in is_matched.items():
-    #     if not found:
-    #         raise ValueError("Invalid seed protein {}. No node named {} in {}.".format(protein, protein, file_path))
-
     # Delete edges that should be ignored or are not contained in the selected dataset.
     deleted_edges = []
@@ -138,17 +127,11 @@
     for edge in deleted_edges:
         g.remove_edge(edge)
     g.set_fast_edge_removal(fast=False)
-    print("Drugs")
-    print(drug_ids)
-    print("Vertices")
     vertices = 0
     for _ in g.vertices():
         vertices += 1
-    print(f'\t{vertices}')
-    print("Edges")
     edges = 0
     for _ in g.edges():
         edges += 1
-    print(f'\t{edges}')
 
     # Return the graph and the indices of the seed_ids and the seeds.
     return g, list(seed_ids.keys()), drug_ids
diff --git a/tasks/util/scores_to_results.py b/tasks/util/scores_to_results.py
index 4f47663b95624497843710b70a72c55cd12a61a8..e3b679ec417a2cb4e7b6e0df268b642c3073dce5 100755
--- a/tasks/util/scores_to_results.py
+++ b/tasks/util/scores_to_results.py
@@ -15,7 +15,7 @@
     r"""Transforms the scores to the required result format."""
 
-    node_name_attribute = "drugstone_id"  # nodes in the input network which is created from RepoTrialDB have primaryDomainId as name attribute
+    node_name_attribute = "internal_id"  # nodes in the input network which is created from RepoTrialDB have primaryDomainId as name attribute
     candidates = []
     # if strain_or_drugs == "drugs":
     if target == "drug":
@@ -23,8 +23,6 @@
     else:
         candidates = [(node, scores[node]) for node in range(g.num_vertices()) if scores[node] > 0 and node not in set(seed_ids)]
     best_candidates = [item[0] for item in sorted(candidates, key=lambda item: item[1], reverse=True)[:result_size]]
-    print(f'Candidate list length: {len(best_candidates)}')
-
     # Concatenate best result candidates with seeds and compute induced subgraph.
     # since the result size filters out nodes, the result network is not complete anymore.
     # Therefore, it is necessary to find the shortest paths to the found nodes in case intermediate nodes have been removed.
@@ -36,6 +34,7 @@
     returned_nodes = set(seed_ids)  # return seed_ids in any case
 
     # return only the path to a drug with the shortest distance
+    accepted_candidates = set()
     if filterPaths:
         for candidate in best_candidates:
             distances = gtt.shortest_distance(g, candidate, seed_ids)
@@ -53,11 +52,12 @@
                     break
             if drug_in_path:
                 continue
-
+            accepted_candidates.add(g.vertex_properties[node_name_attribute][int(candidate)])
             for vertex in vertices:
                 if int(vertex) not in returned_nodes:
                     # inserting intermediate node in order to make result comprehensive
-                    intermediate_nodes.add(g.vertex_properties[node_name_attribute][int(vertex)])
+                    if vertex != candidate:
+                        intermediate_nodes.add(g.vertex_properties[node_name_attribute][int(vertex)])
                     returned_nodes.add(int(vertex))
             for edge in edges:
                 if ((edge.source(), edge.target()) not in returned_edges) or ((edge.target(), edge.source()) not in returned_edges):
                     returned_edges.add((edge.source(), edge.target()))
@@ -74,18 +74,21 @@
                     break
             if drug_in_path:
                 continue
-
+            accepted_candidates.add(g.vertex_properties[node_name_attribute][int(candidate)])
             for vertex in vertices:
                 if int(vertex) not in returned_nodes:
                     # inserting intermediate node in order to make result comprehensive
-                    intermediate_nodes.add(g.vertex_properties[node_name_attribute][int(vertex)])
+                    if vertex != candidate:
+                        intermediate_nodes.add(g.vertex_properties[node_name_attribute][int(vertex)])
                     returned_nodes.add(int(vertex))
             for edge in edges:
                 if ((edge.source(), edge.target()) not in returned_edges) or ((edge.target(), edge.source()) not in returned_edges):
                     returned_edges.add((edge.source(), edge.target()))
-    print(f'Returned nodes number: {len(returned_nodes)}')
+    for node in accepted_candidates:
+        if node in intermediate_nodes:
+            intermediate_nodes.remove(node)
     subgraph = {
-        "nodes": [g.vertex_properties[node_name_attribute][node] for node in returned_nodes],
+        "nodes":[g.vertex_properties[node_name_attribute][node] for node in returned_nodes],
         "edges": [{"from": g.vertex_properties[node_name_attribute][source], "to": g.vertex_properties[node_name_attribute][target]} for source, target in returned_edges],
     }
@@ -97,6 +100,7 @@ def scores_to_results(
     return {
         "network": subgraph,
         'intermediate_nodes': list(intermediate_nodes),
+        'target_nodes': list(accepted_candidates),
         "node_attributes": {
             "node_types": node_types,
diff --git a/tasks/util/steiner_tree.py b/tasks/util/steiner_tree.py
index 27cbedb551610ce66b7b1a43cc740744e4b753ad..f91f6471f51939becd4f81594d4ac7ecc81d6818 100755
--- a/tasks/util/steiner_tree.py
+++ b/tasks/util/steiner_tree.py
@@ -5,7 +5,7 @@ import itertools as it
 
 
 def steiner_tree(g, seeds, seed_map, weights, non_zero_hub_penalty):
-    node_name_attribute = "drugstone_id"  # nodes in the input network which is created from RepoTrialDB have primaryDomainId as name attribute
+    node_name_attribute = "internal_id"  # nodes in the input network which is created from RepoTrialDB have primaryDomainId as name attribute
     mc = gt.Graph(directed=False)
     eprop_dist = mc.new_edge_property("int")
     mc.ep['dist'] = eprop_dist
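
For reference, the edge de-duplication introduced in make_graphs.py canonicalizes undirected protein-protein edges by ordering the endpoint IDs, so that (a, b) and (b, a) yield the same key and only the first occurrence is added to the graph. A standalone sketch of the same logic (hypothetical example values; create_gt above inlines this):

def canonical_edge_key(id1, id2):
    # Order-independent key for an undirected edge.
    if id1 > id2:
        id1, id2 = id2, id1
    return f"{id1}_{id2}"

uniq_edges = set()
for a, b in [(7, 3), (3, 7), (3, 9)]:  # example endpoint IDs; (3, 7) duplicates (7, 3)
    key = canonical_edge_key(a, b)
    if key not in uniq_edges:
        uniq_edges.add(key)
        # here create_gt would call g.add_edge(vertices[a], vertices[b])
print(sorted(uniq_edges))  # ['3_7', '3_9']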