Commit 50c346aa authored by AndiMajore

Handle protein nodes with multiple IDs outside of the main ID space

parent d8bb91fa
@@ -14,4 +14,4 @@ DJANGO_SETTINGS_MODULE=drugstone.settings
 CELERY_BROKER_URL=redis://redis:6379/0
 FLOWER_PORT=8888
 FLOWER_BASIC_AUTH=drugstone:test
-GT_THREADS=8
\ No newline at end of file
+GT_THREADS=2
\ No newline at end of file
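
Note: GT_THREADS caps the number of OpenMP threads graph-tool may use; this commit lowers it from 8 to 2. A minimal sketch of how such a variable is typically consumed at startup, assuming it is read from the environment (the exact call site is outside this diff; graph_tool.openmp_enabled and graph_tool.openmp_set_num_threads are the real graph-tool APIs):

    import os
    import graph_tool.all as gt

    # Cap graph-tool's OpenMP thread pool from the GT_THREADS env variable.
    if gt.openmp_enabled():
        gt.openmp_set_num_threads(int(os.environ.get("GT_THREADS", "2")))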
@@ -95,7 +95,9 @@ def create_gt(params: Tuple) -> None:
     v_type = g.new_vertex_property("string")
     v_name = g.new_vertex_property("string")
     v_drugstone_id = g.new_vertex_property("string")
-    v_entrez = g.new_vertex_property("string")
+    v_has_symbol = g.new_vertex_property("bool")
+    v_has_entrez = g.new_vertex_property("bool")
+    v_has_ensembl = g.new_vertex_property("bool")
     v_expression = g.new_vertex_property("string")
     # for drugs
@@ -108,7 +110,9 @@ def create_gt(params: Tuple) -> None:
     g.vertex_properties["type"] = v_type
     g.vertex_properties["name"] = v_name
     g.vertex_properties["drugstone_id"] = v_drugstone_id
-    g.vertex_properties["entrez"] = v_entrez
+    g.vertex_properties["has_symbol"] = v_has_symbol
+    g.vertex_properties["has_entrez"] = v_has_entrez
+    g.vertex_properties["has_ensembl"] = v_has_ensembl
     g.vertex_properties["status"] = v_status
     g.vertex_properties["drug_id"] = v_drug_id
     g.vertex_properties["expression"] = v_expression
@@ -122,11 +126,16 @@ def create_gt(params: Tuple) -> None:
     print(f'loading nodes')
     # extend node data by cancer nodes, we create a normal node for each cancer node.
     # on reading the data, we decide which one to keep based on the user selected cancer types
+    has_ensembl_set = {node.protein_id for node in models.EnsemblGene.objects.all()}
     for node in models.Protein.objects.all():
         v = g.add_vertex()
         v_type[v] = 'protein'
         v_drugstone_id[v] = f"p{node.id}"
+        v_has_symbol[v] = len(node.gene) != 0
+        v_has_entrez[v] = len(node.entrez) != 0
+        v_has_ensembl[v] = node.id in has_ensembl_set
         vertices[node.id] = v
     print("done with nodes")
@@ -148,7 +157,6 @@ def create_gt(params: Tuple) -> None:
         e_type[e] = 'protein-protein'
     print("done with edges")
-
     print(f'loading drug_edges/{pdi_dataset}')
     for edge_raw in _internal_pdis(pdi_dataset):
         e = g.add_edge(drug_vertices[edge_raw.drug_id], vertices[edge_raw.protein_id])
@@ -161,7 +169,7 @@ def create_gt(params: Tuple) -> None:
         if vertex.out_degree() == 0:
             delete_vertices.add(vertex)
-    #remove unconnected drugs
+    # remove unconnected drugs
     for vertex in drug_vertices.values():
         if vertex.out_degree() == 0:
             delete_vertices.add(vertex)
...
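
Note: instead of a single entrez string per vertex, the graph now stores three boolean flags recording whether a protein is resolvable as a gene symbol, an Entrez ID, or an Ensembl gene. A hedged sketch of how a consumer might use them (the helper in_id_space is hypothetical, not part of this commit):

    # Hypothetical helper: test whether a protein vertex is addressable
    # in the requested id space via the new boolean vertex properties.
    ID_SPACE_FLAGS = {"symbol": "has_symbol", "entrez": "has_entrez", "ensg": "has_ensembl"}

    def in_id_space(g, v, id_space):
        flag = ID_SPACE_FLAGS.get(id_space)
        # Unknown id spaces are treated as unrestricted.
        return flag is None or bool(g.vertex_properties[flag][v])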
-from typing import List, Tuple, Set
+from collections import defaultdict
+from typing import List, Tuple, Set, OrderedDict
 from functools import reduce
 from django.db.models import Q
 from drugstone.models import Protein, EnsemblGene
@@ -31,7 +32,8 @@ def query_proteins_by_identifier(node_ids: Set[str], identifier: str) -> Tuple[L
         q_list = map(lambda n: Q(uniprot_code__iexact=n), node_ids)
     elif identifier == 'ensg':
         protein_attribute = 'ensg'
-        node_ids = map(lambda n: n.protein_id, EnsemblGene.objects.filter(reduce(lambda a,b: a|b, map(lambda n:Q(name__iexact=n),list(node_ids)))))
+        node_ids = map(lambda n: n.protein_id, EnsemblGene.objects.filter(
+            reduce(lambda a, b: a | b, map(lambda n: Q(name__iexact=n), list(node_ids)))))
         q_list = map(lambda n: Q(id=n), node_ids)
     elif identifier == 'entrez':
         protein_attribute = 'entrez'
@@ -40,9 +42,27 @@ def query_proteins_by_identifier(node_ids: Set[str], identifier: str) -> Tuple[L
         # node_ids is an empty list
         return [], protein_attribute
     q_list = reduce(lambda a, b: a | b, q_list)
     node_objects = Protein.objects.filter(q_list)
-    nodes = ProteinSerializer(many=True).to_representation(node_objects)
+    # serialize
+    nodes = list()
+    node_map = defaultdict(list)
+    for node in ProteinSerializer(many=True).to_representation(node_objects):
+        node_map[node.get(protein_attribute)].append(node)
+    for node_id, entries in node_map.items():
+        nodes.append(aggregate_nodes(entries))
     return nodes, protein_attribute
+
+
+def aggregate_nodes(nodes: List[OrderedDict]):
+    node = defaultdict(set)
+    for n in nodes:
+        for key, value in n.items():
+            if isinstance(value, list):
+                for e in value:
+                    node[key].add(e)
+            else:
+                node[key].add(value)
+    return {k: list(v) for k, v in node.items()}
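
Note: aggregate_nodes collapses duplicate serializer entries that share the same value for the queried attribute; every field of the merged node becomes a list of the distinct values seen. A small usage illustration (values invented; set iteration makes the list order nondeterministic):

    entries = [
        {"id": 1, "symbol": "ABC", "ensg": ["ENSG0001"]},
        {"id": 2, "symbol": "ABC", "ensg": ["ENSG0002"]},
    ]
    merged = aggregate_nodes(entries)
    # e.g. {"id": [1, 2], "symbol": ["ABC"], "ensg": ["ENSG0001", "ENSG0002"]}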
@@ -58,7 +58,7 @@ def get_pdis_ds(source, licenced):
 def get_drdis_ds(source, licenced):
     try:
         ds = models.PDisDataset.objects.filter(name__iexact=source, licenced=licenced).last()
         ds.id
         return ds
     except:
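
Note: the bare `ds.id` access doubles as an existence check here: `filter(...).last()` returns None when nothing matches, so the attribute access raises and control drops into the except branch instead of returning a missing dataset.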
@@ -128,7 +128,14 @@ def fetch_edges(request) -> Response:
         Response: List of edges which are objects with 'from' and 'to' attributes
     """
     dataset = request.data.get('dataset', DEFAULTS['ppi'])
-    drugstone_ids = [node['drugstone_id'][1:] for node in request.data.get('nodes', '[]') if 'drugstone_id' in node]
+    drugstone_ids = set()
+    for node in request.data.get('nodes', '[]'):
+        if 'drugstone_id' in node:
+            if isinstance(node['drugstone_id'], list):
+                for id in node['drugstone_id']:
+                    drugstone_ids.add(id[1:])
+            else:
+                drugstone_ids.add(node['drugstone_id'])
     licenced = request.data.get('licenced', False)
     dataset_object = get_ppi_ds(dataset, licenced)
     interaction_objects = models.ProteinProteinInteraction.objects.filter(
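
Note: after the aggregation change a node's drugstone_id may be either a single string or a list, so the endpoint lookup now accepts both shapes. Illustrative payloads (ids invented):

    nodes = [
        {"drugstone_id": ["p123", "p456"]},  # list entries: the "p" prefix is stripped
        {"drugstone_id": "p789"},            # scalar: added as-is by the code above
    ]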
@@ -168,16 +175,19 @@ def map_nodes(request) -> Response:
     nodes_mapped, id_key = query_proteins_by_identifier(node_ids, identifier)

     # change data structure to dict in order to be quicker when merging
-    if identifier == 'ensg':
-        # a protein might have multiple ensg-numbers, unpack these into single nodes
-        nodes_mapped_dict = {node_id: node for node in nodes_mapped for node_id in node[id_key]}
-    else:
-        nodes_mapped_dict = {node[id_key]: node for node in nodes_mapped}
+    # if identifier == 'ensg':
+    #     # a protein might have multiple ensg-numbers, unpack these into single nodes
+    #     nodes_mapped_dict = {node_id: node for node in nodes_mapped for node_id in node[id_key]}
+    # else:
+    nodes_mapped_dict = {node[id_key][0]: node for node in nodes_mapped}

     # merge fetched data with given data to avoid data loss
     for node in nodes:
+        node['drugstoneType'] = 'other'
         if node['id'] in nodes_mapped_dict:
             node.update(nodes_mapped_dict[node['id']])
+            node['drugstoneType'] = 'protein'
         node['id'] = id_map[node['id']]

     # set label to node identifier if label is unset, otherwise
     # return list of updated nodes
     return Response(nodes)
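
Note: since query_proteins_by_identifier now returns aggregated nodes whose attributes are lists, the lookup dict is keyed by the first identifier of each node; the commented-out ensg special case is subsumed by the aggregation. A small illustration (values invented):

    nodes_mapped = [{"symbol": ["ABC"], "drugstone_id": ["p1", "p2"]}]
    nodes_mapped_dict = {node["symbol"][0]: node for node in nodes_mapped}
    # -> {"ABC": {...}}; input nodes matching "ABC" are updated and tagged
    #    drugstoneType = 'protein', all others fall back to 'other'.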
@@ -489,6 +499,8 @@ def adjacent_disorders(request) -> Response:
     # serialize
     edges = DrugDisorderIndicationSerializer(many=True).to_representation(drdi_objects)
     disorders = DisorderSerializer(many=True).to_representation(disorders)
+    for d in disorders:
+        d['drugstone_type'] = 'disorder'
     return Response({
         'edges': edges,
         'disorders': disorders,
@@ -514,6 +526,9 @@ def adjacent_drugs(request) -> Response:
     # serialize
     pdis = ProteinDrugInteractionSerializer(many=True).to_representation(pdi_objects)
     drugs = DrugSerializer(many=True).to_representation(drugs)
+    for drug in drugs:
+        drug['drugstone_type'] = 'drug'
     return Response({
         'pdis': pdis,
         'drugs': drugs,
...
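
Note: serialized drugs and disorders are now tagged with an explicit drugstone_type so mixed payloads can be told apart client-side. Illustrative record shape (field values invented):

    drug = {"drugstone_id": "dr123", "label": "aspirin"}
    drug["drugstone_type"] = "drug"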
@@ -170,13 +170,17 @@ def betweenness_centrality(task_hook: TaskHook):
     filterPaths = task_hook.parameters.get("filter_paths", True)
+    id_space = task_hook.parameters["config"].get("identifier", "symbol")
+    print(id_space)
+
     # Parsing input file.
     task_hook.set_progress(0 / 3.0, "Parsing input.")
     filename = f"internal_{ppi_dataset['name']}_{pdi_dataset['name']}"
     if ppi_dataset['licenced'] or pdi_dataset['licenced']:
         filename += "_licenced"
     filename = os.path.join(task_hook.data_directory, filename+".gt")
-    g, seed_ids, drug_ids = read_graph_tool_graph(
+    g, seed_ids, id_space, drug_ids = read_graph_tool_graph(
         filename,
         seeds,
         max_deg,
...
@@ -173,9 +173,12 @@ def closeness_centrality(task_hook: TaskHook):
     filename = f"internal_{ppi_dataset['name']}_{pdi_dataset['name']}"
     if ppi_dataset['licenced'] or pdi_dataset['licenced']:
         filename += "_licenced"
+    id_space = task_hook.parameters["config"].get("identifier", "symbol")
     filename = os.path.join(task_hook.data_directory, filename+".gt")
     # g, seed_ids, viral_protein_ids, drug_ids = read_graph_tool_graph(file_path, seeds, datasets, ignored_edge_types, max_deg, ignore_non_seed_baits, include_indirect_drugs, include_non_approved_drugs)
-    g, seed_ids, drug_ids = read_graph_tool_graph(filename, seeds, max_deg, include_indirect_drugs, include_non_approved_drugs, search_target)
+    g, seed_ids, drug_ids = read_graph_tool_graph(filename, seeds, id_space, max_deg, include_indirect_drugs, include_non_approved_drugs, search_target)
     task_hook.set_progress(1 / 4.0, "Computing edge weights.")
     weights = edge_weights(g, hub_penalty)
...
@@ -153,9 +153,12 @@ def degree_centrality(task_hook: TaskHook):
     filename = f"internal_{ppi_dataset['name']}_{pdi_dataset['name']}"
     if ppi_dataset['licenced'] or pdi_dataset['licenced']:
         filename += "_licenced"
+    id_space = task_hook.parameters["config"].get("identifier", "symbol")
     filename = os.path.join(task_hook.data_directory, filename+".gt")
     # g, seed_ids, viral_protein_ids, drug_ids = read_graph_tool_graph(file_path, seeds, datasets, ignored_edge_types, max_deg, ignore_non_seed_baits, False, include_non_approved_drugs)
-    g, seed_ids, drug_ids = read_graph_tool_graph(filename, seeds, max_deg, False, include_non_approved_drugs, search_target)
+    g, seed_ids, drug_ids = read_graph_tool_graph(filename, seeds, id_space, max_deg, False, include_non_approved_drugs, search_target)
     # Set number of threads if OpenMP support is enabled.
     if gt.openmp_enabled():
...
@@ -105,11 +105,15 @@ def multi_steiner(task_hook: TaskHook):
     # Parsing input file.
     task_hook.set_progress(0 / (float(num_trees + 3)), "Parsing input.")
+    id_space = task_hook.parameters["config"].get("identifier", "symbol")
     filename = f"internal_{ppi_dataset['name']}_{pdi_dataset['name']}"
     if ppi_dataset['licenced'] or pdi_dataset['licenced']:
         filename += "_licenced"
     filename = os.path.join(task_hook.data_directory, filename+".gt")
-    g, seed_ids, _ = read_graph_tool_graph(filename, seeds, max_deg, target=search_target)
+    g, seed_ids, _ = read_graph_tool_graph(filename, seeds, id_space, max_deg, target=search_target)
     # seed_map = {g.vertex_properties["name"][node]: node for node in seed_ids}
     seed_map = {g.vertex_properties[node_name_attribute][node]: node for node in seed_ids}
     task_hook.set_progress(1 / (float(num_trees + 3)), "Computing edge weights.")
...
@@ -86,12 +86,15 @@ def network_proximity(task_hook: TaskHook):
     # Parsing input file.
     task_hook.set_progress(0.0 / 8, "Parsing input.")
+    id_space = task_hook.parameters["config"].get("identifier", "symbol")
     filename = f"internal_{ppi_dataset['name']}_{pdi_dataset['name']}"
     if ppi_dataset['licenced'] or pdi_dataset['licenced']:
         filename += "_licenced"
     filename = os.path.join(task_hook.data_directory, filename+".gt")
     # g, seed_ids, _, drug_ids = read_graph_tool_graph(file_path, seeds, "", "", max_deg, False, True, include_non_approved_drugs)
-    g, seed_ids, drug_ids = read_graph_tool_graph(filename, seeds, max_deg, True, include_non_approved_drugs, target=search_target)
+    g, seed_ids, drug_ids = read_graph_tool_graph(filename, seeds, id_space, max_deg, True, include_non_approved_drugs, target=search_target)
     # Computing edge weights.
     task_hook.set_progress(1.0 / 8, "Computing edge weights.")
     weights = edge_weights(g, hub_penalty)
...
@@ -198,11 +198,14 @@ def trust_rank(task_hook: TaskHook):
     # Parsing input file.
     task_hook.set_progress(0 / 4.0, "Parsing input.")
+    id_space = task_hook.parameters["config"].get("identifier", "symbol")
     filename = f"internal_{ppi_dataset['name']}_{pdi_dataset['name']}"
     if ppi_dataset['licenced'] or pdi_dataset['licenced']:
         filename += "_licenced"
     filename = os.path.join(task_hook.data_directory, filename+".gt")
-    g, seed_ids, drug_ids = read_graph_tool_graph(filename, seeds, max_deg, include_indirect_drugs, include_non_approved_drugs, search_target)
+    g, seed_ids, drug_ids = read_graph_tool_graph(filename, seeds, id_space, max_deg, include_indirect_drugs, include_non_approved_drugs, search_target)
     task_hook.set_progress(1 / 4.0, "Computing edge weights.")
     weights = edge_weights(g, hub_penalty, inverse=True)
...
@@ -3,7 +3,7 @@ import graph_tool.topology as gtt
 # def read_graph_tool_graph(file_path, seeds, datasets, ignored_edge_types, max_deg, ignore_non_seed_baits=False, include_indirect_drugs=False, include_non_approved_drugs=False):
-def read_graph_tool_graph(file_path, seeds, max_deg, include_indirect_drugs=False, include_non_approved_drugs=False,
+def read_graph_tool_graph(file_path, seeds, id_space, max_deg, include_indirect_drugs=False, include_non_approved_drugs=False,
                           target='drug'):
     r"""Reads a graph-tool graph from file.
...