From ff8f093b2021ac8e1dd90449d61196ae4ed76e61 Mon Sep 17 00:00:00 2001 From: AndiMajore <andi.majore@googlemail.com> Date: Tue, 19 Jul 2022 17:49:07 +0200 Subject: [PATCH] handling protein nodes now with multiple ids outside of main id space Former-commit-id: 6244c171b0ec97affbc2992e4e568b6d9d6c22d6 [formerly 8eb44b9fb87e1ca95eb0c09c98e7253b1f0ce702] Former-commit-id: 6e33edc763b4f47b04180e818f68097887204133 --- docker-django.env.dev | 2 +- drugstone/management/commands/make_graphs.py | 18 ++++++++---- drugstone/util/query_db.py | 30 ++++++++++++++++---- drugstone/views.py | 29 ++++++++++++++----- tasks/betweenness_centrality.py | 6 +++- tasks/closeness_centrality.py | 5 +++- tasks/degree_centrality.py | 5 +++- tasks/multi_steiner.py | 6 +++- tasks/network_proximity.py | 5 +++- tasks/trust_rank.py | 5 +++- tasks/util/read_graph_tool_graph.py | 2 +- 11 files changed, 88 insertions(+), 25 deletions(-) diff --git a/docker-django.env.dev b/docker-django.env.dev index 66f78b1..1f3d65d 100644 --- a/docker-django.env.dev +++ b/docker-django.env.dev @@ -14,4 +14,4 @@ DJANGO_SETTINGS_MODULE=drugstone.settings CELERY_BROKER_URL=redis://redis:6379/0 FLOWER_PORT=8888 FLOWER_BASIC_AUTH=drugstone:test -GT_THREADS=8 \ No newline at end of file +GT_THREADS=2 \ No newline at end of file diff --git a/drugstone/management/commands/make_graphs.py b/drugstone/management/commands/make_graphs.py index 98c7ce1..0ff9bb3 100755 --- a/drugstone/management/commands/make_graphs.py +++ b/drugstone/management/commands/make_graphs.py @@ -95,7 +95,9 @@ def create_gt(params: Tuple) -> None: v_type = g.new_vertex_property("string") v_name = g.new_vertex_property("string") v_drugstone_id = g.new_vertex_property("string") - v_entrez = g.new_vertex_property("string") + v_has_symbol = g.new_vertex_property("bool") + v_has_entrez = g.new_vertex_property("bool") + v_has_ensembl = g.new_vertex_property("bool") v_expression = g.new_vertex_property("string") # for drugs @@ -108,7 +110,9 @@ def 
create_gt(params: Tuple) -> None: g.vertex_properties["type"] = v_type g.vertex_properties["name"] = v_name g.vertex_properties["drugstone_id"] = v_drugstone_id - g.vertex_properties["entrez"] = v_entrez + g.vertex_properties["has_symbol"] = v_has_symbol + g.vertex_properties["has_entrez"] = v_has_entrez + g.vertex_properties["has_ensembl"] = v_has_ensembl g.vertex_properties["status"] = v_status g.vertex_properties["drug_id"] = v_drug_id g.vertex_properties["expression"] = v_expression @@ -122,11 +126,16 @@ def create_gt(params: Tuple) -> None: print(f'loading nodes') # extend node data by cancer nodes, we create a normal node for each cancer node. # on reading the data, we decide which one to keep based on the user selected cancer types + + has_ensembl_set = {node.protein_id for node in models.EnsemblGene.objects.all()} + for node in models.Protein.objects.all(): v = g.add_vertex() v_type[v] = 'protein' v_drugstone_id[v] = f"p{node.id}" - + v_has_symbol[v] = len(node.gene) != 0 + v_has_entrez[v] = len(node.entrez) != 0 + v_has_ensembl[v] = node.id in has_ensembl_set vertices[node.id] = v print("done with nodes") @@ -148,7 +157,6 @@ def create_gt(params: Tuple) -> None: e_type[e] = 'protein-protein' print("done with edges") - print(f'loading drug_edges/{pdi_dataset}') for edge_raw in _internal_pdis(pdi_dataset): e = g.add_edge(drug_vertices[edge_raw.drug_id], vertices[edge_raw.protein_id]) @@ -161,7 +169,7 @@ def create_gt(params: Tuple) -> None: if vertex.out_degree() == 0: delete_vertices.add(vertex) - #remove unconnected drugs + # remove unconnected drugs for vertex in drug_vertices.values(): if vertex.out_degree() == 0: delete_vertices.add(vertex) diff --git a/drugstone/util/query_db.py b/drugstone/util/query_db.py index 62e2e83..89fd8be 100644 --- a/drugstone/util/query_db.py +++ b/drugstone/util/query_db.py @@ -1,4 +1,5 @@ -from typing import List, Tuple, Set +from collections import defaultdict +from typing import List, Tuple, Set, OrderedDict from 
functools import reduce from django.db.models import Q from drugstone.models import Protein, EnsemblGene @@ -31,7 +32,8 @@ def query_proteins_by_identifier(node_ids: Set[str], identifier: str) -> Tuple[L q_list = map(lambda n: Q(uniprot_code__iexact=n), node_ids) elif identifier == 'ensg': protein_attribute = 'ensg' - node_ids = map(lambda n: n.protein_id, EnsemblGene.objects.filter(reduce(lambda a,b: a|b, map(lambda n:Q(name__iexact=n),list(node_ids))))) + node_ids = map(lambda n: n.protein_id, EnsemblGene.objects.filter( + reduce(lambda a, b: a | b, map(lambda n: Q(name__iexact=n), list(node_ids))))) q_list = map(lambda n: Q(id=n), node_ids) elif identifier == 'entrez': protein_attribute = 'entrez' @@ -40,9 +42,27 @@ def query_proteins_by_identifier(node_ids: Set[str], identifier: str) -> Tuple[L # node_ids is an empty list return [], protein_attribute q_list = reduce(lambda a, b: a | b, q_list) - node_objects = Protein.objects.filter(q_list) - # serialize - nodes = ProteinSerializer(many=True).to_representation(node_objects) + + nodes = list() + + node_map = defaultdict(list) + + for node in ProteinSerializer(many=True).to_representation(Protein.objects.filter(q_list)): + node_map[node.get(protein_attribute)].append(node) + for node_id, entries in node_map.items(): + nodes.append(aggregate_nodes(entries)) return nodes, protein_attribute + + +def aggregate_nodes(nodes: List[OrderedDict]): + node = defaultdict(set) + for n in nodes: + for key, value in n.items(): + if isinstance(value,list): + for e in value: + node[key].add(e) + else: + node[key].add(value) + return {k: list(v) for k, v in node.items()} diff --git a/drugstone/views.py b/drugstone/views.py index 19cde80..038fa88 100755 --- a/drugstone/views.py +++ b/drugstone/views.py @@ -58,7 +58,7 @@ def get_pdis_ds(source, licenced): def get_drdis_ds(source, licenced): try: - ds = models.PDisDataset.objects.filter(name__iexact=source, 
licenced=licenced).last() ds.id return ds except: @@ -128,7 +128,14 @@ def fetch_edges(request) -> Response: Response: List of edges which are objects with 'from' and to ' attribtues' """ dataset = request.data.get('dataset', DEFAULTS['ppi']) - drugstone_ids = [node['drugstone_id'][1:] for node in request.data.get('nodes', '[]') if 'drugstone_id' in node] + drugstone_ids = set() + for node in request.data.get('nodes', '[]'): + if 'drugstone_id' in node: + if isinstance(node['drugstone_id'], list): + for id in node['drugstone_id']: + drugstone_ids.add(id[1:]) + else: + drugstone_ids.add(node['drugstone_id'][1:]) licenced = request.data.get('licenced', False) dataset_object = get_ppi_ds(dataset, licenced) interaction_objects = models.ProteinProteinInteraction.objects.filter( @@ -168,16 +175,19 @@ def map_nodes(request) -> Response: nodes_mapped, id_key = query_proteins_by_identifier(node_ids, identifier) # change data structure to dict in order to be quicker when merging - if identifier == 'ensg': - # a protein might have multiple ensg-numbers, unpack these into single nodes - nodes_mapped_dict = {node_id: node for node in nodes_mapped for node_id in node[id_key]} - else: - nodes_mapped_dict = {node[id_key]: node for node in nodes_mapped} + # if identifier == 'ensg': + # # a protein might have multiple ensg-numbers, unpack these into single nodes + # nodes_mapped_dict = {node_id: node for node in nodes_mapped for node_id in node[id_key]} + # else: + nodes_mapped_dict = {node[id_key][0]: node for node in nodes_mapped} # merge fetched data with given data to avoid data loss for node in nodes: + node['drugstoneType'] = 'other' if node['id'] in nodes_mapped_dict: node.update(nodes_mapped_dict[node['id']]) + node['drugstoneType'] = 'protein' node['id'] = id_map[node['id']] + # set label to node identifier if label is unset, otherwise # return list of nodes updated nodes return Response(nodes) @@ -489,6 +499,8 @@ def adjacent_disorders(request) -> Response: # serialize edges = 
DrugDisorderIndicationSerializer(many=True).to_representation(drdi_objects) disorders = DisorderSerializer(many=True).to_representation(disorders) + for d in disorders: + d['drugstone_type'] = 'disorder' return Response({ 'edges': edges, 'disorders': disorders, @@ -514,6 +526,9 @@ def adjacent_drugs(request) -> Response: # serialize pdis = ProteinDrugInteractionSerializer(many=True).to_representation(pdi_objects) drugs = DrugSerializer(many=True).to_representation(drugs) + for drug in drugs: + drug['drugstone_type'] = 'drug' + return Response({ 'pdis': pdis, 'drugs': drugs, diff --git a/tasks/betweenness_centrality.py b/tasks/betweenness_centrality.py index d839c5e..f763b1e 100755 --- a/tasks/betweenness_centrality.py +++ b/tasks/betweenness_centrality.py @@ -170,13 +170,17 @@ def betweenness_centrality(task_hook: TaskHook): filterPaths = task_hook.parameters.get("filter_paths", True) + id_space = task_hook.parameters["config"].get("identifier","symbol") + + print(id_space) + # Parsing input file. 
task_hook.set_progress(0 / 3.0, "Parsing input.") filename = f"internal_{ppi_dataset['name']}_{pdi_dataset['name']}" if ppi_dataset['licenced'] or pdi_dataset['licenced']: filename += "_licenced" filename = os.path.join(task_hook.data_directory, filename+".gt") - g, seed_ids, drug_ids = read_graph_tool_graph( + g, seed_ids, id_space, drug_ids = read_graph_tool_graph( filename, seeds, max_deg, diff --git a/tasks/closeness_centrality.py b/tasks/closeness_centrality.py index a90014f..78222aa 100755 --- a/tasks/closeness_centrality.py +++ b/tasks/closeness_centrality.py @@ -173,9 +173,12 @@ def closeness_centrality(task_hook: TaskHook): filename = f"internal_{ppi_dataset['name']}_{pdi_dataset['name']}" if ppi_dataset['licenced'] or pdi_dataset['licenced']: filename += "_licenced" + + id_space = task_hook.parameters["config"].get("identifier", "symbol") + filename = os.path.join(task_hook.data_directory, filename+".gt") # g, seed_ids, viral_protein_ids, drug_ids = read_graph_tool_graph(file_path, seeds, datasets, ignored_edge_types, max_deg, ignore_non_seed_baits, include_indirect_drugs, include_non_approved_drugs) - g, seed_ids, drug_ids = read_graph_tool_graph(filename, seeds, max_deg, include_indirect_drugs, include_non_approved_drugs, search_target) + g, seed_ids, drug_ids = read_graph_tool_graph(filename, seeds, id_space, max_deg, include_indirect_drugs, include_non_approved_drugs, search_target) task_hook.set_progress(1 / 4.0, "Computing edge weights.") weights = edge_weights(g, hub_penalty) diff --git a/tasks/degree_centrality.py b/tasks/degree_centrality.py index bceffec..1a709a0 100755 --- a/tasks/degree_centrality.py +++ b/tasks/degree_centrality.py @@ -153,9 +153,12 @@ def degree_centrality(task_hook: TaskHook): filename = f"internal_{ppi_dataset['name']}_{pdi_dataset['name']}" if ppi_dataset['licenced'] or pdi_dataset['licenced']: filename += "_licenced" + + id_space = task_hook.parameters["config"].get("identifier", "symbol") + filename = 
os.path.join(task_hook.data_directory, filename+".gt") # g, seed_ids, viral_protein_ids, drug_ids = read_graph_tool_graph(file_path, seeds, datasets, ignored_edge_types, max_deg, ignore_non_seed_baits, False, include_non_approved_drugs) - g, seed_ids, drug_ids = read_graph_tool_graph(filename, seeds, max_deg, False, include_non_approved_drugs, search_target) + g, seed_ids, drug_ids = read_graph_tool_graph(filename, seeds, id_space, max_deg, False, include_non_approved_drugs, search_target) # Set number of threads if OpenMP support is enabled. if gt.openmp_enabled(): diff --git a/tasks/multi_steiner.py b/tasks/multi_steiner.py index f8e9a80..6e81bca 100755 --- a/tasks/multi_steiner.py +++ b/tasks/multi_steiner.py @@ -105,11 +105,15 @@ def multi_steiner(task_hook: TaskHook): # Parsing input file. task_hook.set_progress(0 / (float(num_trees + 3)), "Parsing input.") + + id_space = task_hook.parameters["config"].get("identifier", "symbol") + filename = f"internal_{ppi_dataset['name']}_{pdi_dataset['name']}" if ppi_dataset['licenced'] or pdi_dataset['licenced']: filename += "_licenced" + filename = os.path.join(task_hook.data_directory, filename+".gt") - g, seed_ids, _ = read_graph_tool_graph(filename, seeds, max_deg, target=search_target) + g, seed_ids, _ = read_graph_tool_graph(filename, seeds, id_space, max_deg, target=search_target) # seed_map = {g.vertex_properties["name"][node]: node for node in seed_ids} seed_map = {g.vertex_properties[node_name_attribute][node]: node for node in seed_ids} task_hook.set_progress(1 / (float(num_trees + 3)), "Computing edge weights.") diff --git a/tasks/network_proximity.py b/tasks/network_proximity.py index 90b7708..6755d8e 100755 --- a/tasks/network_proximity.py +++ b/tasks/network_proximity.py @@ -86,12 +86,15 @@ def network_proximity(task_hook: TaskHook): # Parsing input file. 
task_hook.set_progress(0.0 / 8, "Parsing input.") + + id_space = task_hook.parameters["config"].get("identifier", "symbol") + filename = f"internal_{ppi_dataset['name']}_{pdi_dataset['name']}" if ppi_dataset['licenced'] or pdi_dataset['licenced']: filename += "_licenced" filename = os.path.join(task_hook.data_directory, filename+".gt") # g, seed_ids, _, drug_ids = read_graph_tool_graph(file_path, seeds, "", "", max_deg, False, True, include_non_approved_drugs) - g, seed_ids, drug_ids = read_graph_tool_graph(filename, seeds, max_deg, True, include_non_approved_drugs, target=search_target) + g, seed_ids, drug_ids = read_graph_tool_graph(filename, seeds, id_space, max_deg, True, include_non_approved_drugs, target=search_target) # Computing edge weights. task_hook.set_progress(1.0 / 8, "Computing edge weights.") weights = edge_weights(g, hub_penalty) diff --git a/tasks/trust_rank.py b/tasks/trust_rank.py index ceda67c..4737bbe 100755 --- a/tasks/trust_rank.py +++ b/tasks/trust_rank.py @@ -198,11 +198,14 @@ def trust_rank(task_hook: TaskHook): # Parsing input file. 
task_hook.set_progress(0 / 4.0, "Parsing input.") + + id_space = task_hook.parameters["config"].get("identifier", "symbol") + filename = f"internal_{ppi_dataset['name']}_{pdi_dataset['name']}" if ppi_dataset['licenced'] or pdi_dataset['licenced']: filename += "_licenced" filename = os.path.join(task_hook.data_directory, filename+".gt") - g, seed_ids, drug_ids = read_graph_tool_graph(filename, seeds, max_deg, include_indirect_drugs, include_non_approved_drugs, search_target) + g, seed_ids, drug_ids = read_graph_tool_graph(filename, seeds, id_space, max_deg, include_indirect_drugs, include_non_approved_drugs, search_target) task_hook.set_progress(1 / 4.0, "Computing edge weights.") weights = edge_weights(g, hub_penalty, inverse=True) diff --git a/tasks/util/read_graph_tool_graph.py b/tasks/util/read_graph_tool_graph.py index 4e7c9fe..d496f3b 100755 --- a/tasks/util/read_graph_tool_graph.py +++ b/tasks/util/read_graph_tool_graph.py @@ -3,7 +3,7 @@ import graph_tool.topology as gtt # def read_graph_tool_graph(file_path, seeds, datasets, ignored_edge_types, max_deg, ignore_non_seed_baits=False, include_indirect_drugs=False, include_non_approved_drugs=False): -def read_graph_tool_graph(file_path, seeds, max_deg, include_indirect_drugs=False, include_non_approved_drugs=False, +def read_graph_tool_graph(file_path, seeds,id_space, max_deg, include_indirect_drugs=False, include_non_approved_drugs=False, target='drug'): r"""Reads a graph-tool graph from file. -- GitLab