Skip to content
Snippets Groups Projects
Commit 456b3260 authored by AndiMajore's avatar AndiMajore
Browse files

handling protein nodes now with multiple ids outside of main id space

Former-commit-id: 50c346aa
parent 043f109e
No related branches found
No related tags found
No related merge requests found
......@@ -14,4 +14,4 @@ DJANGO_SETTINGS_MODULE=drugstone.settings
CELERY_BROKER_URL=redis://redis:6379/0
FLOWER_PORT=8888
FLOWER_BASIC_AUTH=drugstone:test
GT_THREADS=8
\ No newline at end of file
GT_THREADS=2
\ No newline at end of file
......@@ -95,7 +95,9 @@ def create_gt(params: Tuple) -> None:
v_type = g.new_vertex_property("string")
v_name = g.new_vertex_property("string")
v_drugstone_id = g.new_vertex_property("string")
v_entrez = g.new_vertex_property("string")
v_has_symbol = g.new_vertex_property("bool")
v_has_entrez = g.new_vertex_property("bool")
v_has_ensembl = g.new_vertex_property("bool")
v_expression = g.new_vertex_property("string")
# for drugs
......@@ -108,7 +110,9 @@ def create_gt(params: Tuple) -> None:
g.vertex_properties["type"] = v_type
g.vertex_properties["name"] = v_name
g.vertex_properties["drugstone_id"] = v_drugstone_id
g.vertex_properties["entrez"] = v_entrez
g.vertex_properties["has_symbol"] = v_has_symbol
g.vertex_properties["has_entrez"] = v_has_entrez
g.vertex_properties["has_ensembl"] = v_has_ensembl
g.vertex_properties["status"] = v_status
g.vertex_properties["drug_id"] = v_drug_id
g.vertex_properties["expression"] = v_expression
......@@ -122,11 +126,16 @@ def create_gt(params: Tuple) -> None:
print(f'loading nodes')
# extend node data by cancer nodes, we create a normal node for each cancer node.
# on reading the data, we decide which one to keep based on the user selected cancer types
has_ensembl_set = {node.protein_id for node in models.EnsemblGene.objects.all()}
for node in models.Protein.objects.all():
v = g.add_vertex()
v_type[v] = 'protein'
v_drugstone_id[v] = f"p{node.id}"
v_has_symbol[v] = len(node.gene) != 0
v_has_entrez[v] = len(node.entrez) != 0
v_has_ensembl[v] = node.id in has_ensembl_set
vertices[node.id] = v
print("done with nodes")
......@@ -148,7 +157,6 @@ def create_gt(params: Tuple) -> None:
e_type[e] = 'protein-protein'
print("done with edges")
print(f'loading drug_edges/{pdi_dataset}')
for edge_raw in _internal_pdis(pdi_dataset):
e = g.add_edge(drug_vertices[edge_raw.drug_id], vertices[edge_raw.protein_id])
......@@ -161,7 +169,7 @@ def create_gt(params: Tuple) -> None:
if vertex.out_degree() == 0:
delete_vertices.add(vertex)
#remove unconnected drugs
# remove unconnected drugs
for vertex in drug_vertices.values():
if vertex.out_degree() == 0:
delete_vertices.add(vertex)
......
from typing import List, Tuple, Set
from collections import defaultdict
from typing import List, Tuple, Set, OrderedDict
from functools import reduce
from django.db.models import Q
from drugstone.models import Protein, EnsemblGene
......@@ -31,7 +32,8 @@ def query_proteins_by_identifier(node_ids: Set[str], identifier: str) -> Tuple[L
q_list = map(lambda n: Q(uniprot_code__iexact=n), node_ids)
elif identifier == 'ensg':
protein_attribute = 'ensg'
node_ids = map(lambda n: n.protein_id, EnsemblGene.objects.filter(reduce(lambda a,b: a|b, map(lambda n:Q(name__iexact=n),list(node_ids)))))
node_ids = map(lambda n: n.protein_id, EnsemblGene.objects.filter(
reduce(lambda a, b: a | b, map(lambda n: Q(name__iexact=n), list(node_ids)))))
q_list = map(lambda n: Q(id=n), node_ids)
elif identifier == 'entrez':
protein_attribute = 'entrez'
......@@ -40,9 +42,27 @@ def query_proteins_by_identifier(node_ids: Set[str], identifier: str) -> Tuple[L
# node_ids is an empty list
return [], protein_attribute
q_list = reduce(lambda a, b: a | b, q_list)
node_objects = Protein.objects.filter(q_list)
# serialize
nodes = ProteinSerializer(many=True).to_representation(node_objects)
nodes = list()
node_map = defaultdict(list)
for node in ProteinSerializer(many=True).to_representation(node_objects):
node_map[node.get(protein_attribute)].append(node)
for node_id, entries in node_map.items():
nodes.append(aggregate_nodes(entries))
return nodes, protein_attribute
def aggregate_nodes(nodes: List[OrderedDict]) -> dict:
    """Merge several serialized nodes that share the same identifier into one.

    Every attribute of the result maps to the deduplicated list of all values
    (scalars as well as elements of list-valued attributes) seen for that
    attribute across *nodes*.

    Uses a dict as an ordered set so duplicate values are dropped while the
    first-seen order is preserved — unlike ``set``, this keeps the serialized
    output deterministic between calls.

    :param nodes: serialized node dicts to merge (e.g. from ProteinSerializer)
    :return: one dict with every attribute value collected into a list
    """
    merged = defaultdict(dict)  # attribute -> ordered "set" of values
    for entry in nodes:
        for key, value in entry.items():
            if isinstance(value, list):
                # flatten list-valued attributes into the collected values
                for element in value:
                    merged[key][element] = None
            else:
                merged[key][value] = None
    return {key: list(values) for key, values in merged.items()}
......@@ -58,7 +58,7 @@ def get_pdis_ds(source, licenced):
def get_drdis_ds(source, licenced):
try:
ds = models.PDisDataset.objects.filter(name__iexact=source, licenced=licenced).last()
ds = models.PDisDataset.objects.filter(name__iexact=source, licenced=licenced).last()
ds.id
return ds
except:
......@@ -128,7 +128,14 @@ def fetch_edges(request) -> Response:
Response: List of edges which are objects with 'from' and 'to' attributes
"""
dataset = request.data.get('dataset', DEFAULTS['ppi'])
drugstone_ids = [node['drugstone_id'][1:] for node in request.data.get('nodes', '[]') if 'drugstone_id' in node]
drugstone_ids = set()
for node in request.data.get('nodes', '[]'):
if 'drugstone_id' in node:
if isinstance(node['drugstone_id'], list):
for id in node['drugstone_id']:
drugstone_ids.add(id[1:])
else:
drugstone_ids.add(node['drugstone_id'])
licenced = request.data.get('licenced', False)
dataset_object = get_ppi_ds(dataset, licenced)
interaction_objects = models.ProteinProteinInteraction.objects.filter(
......@@ -168,16 +175,19 @@ def map_nodes(request) -> Response:
nodes_mapped, id_key = query_proteins_by_identifier(node_ids, identifier)
# change data structure to dict in order to be quicker when merging
if identifier == 'ensg':
# a protein might have multiple ensg-numbers, unpack these into single nodes
nodes_mapped_dict = {node_id: node for node in nodes_mapped for node_id in node[id_key]}
else:
nodes_mapped_dict = {node[id_key]: node for node in nodes_mapped}
# if identifier == 'ensg':
# # a protein might have multiple ensg-numbers, unpack these into single nodes
# nodes_mapped_dict = {node_id: node for node in nodes_mapped for node_id in node[id_key]}
# else:
nodes_mapped_dict = {node[id_key][0]: node for node in nodes_mapped}
# merge fetched data with given data to avoid data loss
for node in nodes:
node['drugstoneType'] = 'other'
if node['id'] in nodes_mapped_dict:
node.update(nodes_mapped_dict[node['id']])
node['drugstoneType'] = 'protein'
node['id'] = id_map[node['id']]
# set label to node identifier if label is unset, otherwise
# return list of nodes updated nodes
return Response(nodes)
......@@ -489,6 +499,8 @@ def adjacent_disorders(request) -> Response:
# serialize
edges = DrugDisorderIndicationSerializer(many=True).to_representation(drdi_objects)
disorders = DisorderSerializer(many=True).to_representation(disorders)
for d in disorders:
d['drugstone_type'] = 'disorder'
return Response({
'edges': edges,
'disorders': disorders,
......@@ -514,6 +526,9 @@ def adjacent_drugs(request) -> Response:
# serialize
pdis = ProteinDrugInteractionSerializer(many=True).to_representation(pdi_objects)
drugs = DrugSerializer(many=True).to_representation(drugs)
for drug in drugs:
drug['drugstone_type'] = 'drug'
return Response({
'pdis': pdis,
'drugs': drugs,
......
......@@ -170,13 +170,17 @@ def betweenness_centrality(task_hook: TaskHook):
filterPaths = task_hook.parameters.get("filter_paths", True)
id_space = task_hook.parameters["config"].get("identifier","symbol")
print(id_space)
# Parsing input file.
task_hook.set_progress(0 / 3.0, "Parsing input.")
filename = f"internal_{ppi_dataset['name']}_{pdi_dataset['name']}"
if ppi_dataset['licenced'] or pdi_dataset['licenced']:
filename += "_licenced"
filename = os.path.join(task_hook.data_directory, filename+".gt")
g, seed_ids, drug_ids = read_graph_tool_graph(
g, seed_ids, id_space, drug_ids = read_graph_tool_graph(
filename,
seeds,
max_deg,
......
......@@ -173,9 +173,12 @@ def closeness_centrality(task_hook: TaskHook):
filename = f"internal_{ppi_dataset['name']}_{pdi_dataset['name']}"
if ppi_dataset['licenced'] or pdi_dataset['licenced']:
filename += "_licenced"
id_space = task_hook.parameters["config"].get("identifier", "symbol")
filename = os.path.join(task_hook.data_directory, filename+".gt")
# g, seed_ids, viral_protein_ids, drug_ids = read_graph_tool_graph(file_path, seeds, datasets, ignored_edge_types, max_deg, ignore_non_seed_baits, include_indirect_drugs, include_non_approved_drugs)
g, seed_ids, drug_ids = read_graph_tool_graph(filename, seeds, max_deg, include_indirect_drugs, include_non_approved_drugs, search_target)
g, seed_ids, drug_ids = read_graph_tool_graph(filename, seeds, id_space, max_deg, include_indirect_drugs, include_non_approved_drugs, search_target)
task_hook.set_progress(1 / 4.0, "Computing edge weights.")
weights = edge_weights(g, hub_penalty)
......
......@@ -153,9 +153,12 @@ def degree_centrality(task_hook: TaskHook):
filename = f"internal_{ppi_dataset['name']}_{pdi_dataset['name']}"
if ppi_dataset['licenced'] or pdi_dataset['licenced']:
filename += "_licenced"
id_space = task_hook.parameters["config"].get("identifier", "symbol")
filename = os.path.join(task_hook.data_directory, filename+".gt")
# g, seed_ids, viral_protein_ids, drug_ids = read_graph_tool_graph(file_path, seeds, datasets, ignored_edge_types, max_deg, ignore_non_seed_baits, False, include_non_approved_drugs)
g, seed_ids, drug_ids = read_graph_tool_graph(filename, seeds, max_deg, False, include_non_approved_drugs, search_target)
g, seed_ids, drug_ids = read_graph_tool_graph(filename, seeds, id_space, max_deg, False, include_non_approved_drugs, search_target)
# Set number of threads if OpenMP support is enabled.
if gt.openmp_enabled():
......
......@@ -105,11 +105,15 @@ def multi_steiner(task_hook: TaskHook):
# Parsing input file.
task_hook.set_progress(0 / (float(num_trees + 3)), "Parsing input.")
id_space = task_hook.parameters["config"].get("identifier", "symbol")
filename = f"internal_{ppi_dataset['name']}_{pdi_dataset['name']}"
if ppi_dataset['licenced'] or pdi_dataset['licenced']:
filename += "_licenced"
filename = os.path.join(task_hook.data_directory, filename+".gt")
g, seed_ids, _ = read_graph_tool_graph(filename, seeds, max_deg, target=search_target)
g, seed_ids, _ = read_graph_tool_graph(filename, seeds, id_space, max_deg, target=search_target)
# seed_map = {g.vertex_properties["name"][node]: node for node in seed_ids}
seed_map = {g.vertex_properties[node_name_attribute][node]: node for node in seed_ids}
task_hook.set_progress(1 / (float(num_trees + 3)), "Computing edge weights.")
......
......@@ -86,12 +86,15 @@ def network_proximity(task_hook: TaskHook):
# Parsing input file.
task_hook.set_progress(0.0 / 8, "Parsing input.")
id_space = task_hook.parameters["config"].get("identifier", "symbol")
filename = f"internal_{ppi_dataset['name']}_{pdi_dataset['name']}"
if ppi_dataset['licenced'] or pdi_dataset['licenced']:
filename += "_licenced"
filename = os.path.join(task_hook.data_directory, filename+".gt")
# g, seed_ids, _, drug_ids = read_graph_tool_graph(file_path, seeds, "", "", max_deg, False, True, include_non_approved_drugs)
g, seed_ids, drug_ids = read_graph_tool_graph(filename, seeds, max_deg, True, include_non_approved_drugs, target=search_target)
g, seed_ids, drug_ids = read_graph_tool_graph(filename, seeds, id_space, max_deg, True, include_non_approved_drugs, target=search_target)
# Computing edge weights.
task_hook.set_progress(1.0 / 8, "Computing edge weights.")
weights = edge_weights(g, hub_penalty)
......
......@@ -198,11 +198,14 @@ def trust_rank(task_hook: TaskHook):
# Parsing input file.
task_hook.set_progress(0 / 4.0, "Parsing input.")
id_space = task_hook.parameters["config"].get("identifier", "symbol")
filename = f"internal_{ppi_dataset['name']}_{pdi_dataset['name']}"
if ppi_dataset['licenced'] or pdi_dataset['licenced']:
filename += "_licenced"
filename = os.path.join(task_hook.data_directory, filename+".gt")
g, seed_ids, drug_ids = read_graph_tool_graph(filename, seeds, max_deg, include_indirect_drugs, include_non_approved_drugs, search_target)
g, seed_ids, drug_ids = read_graph_tool_graph(filename, seeds, id_space, max_deg, include_indirect_drugs, include_non_approved_drugs, search_target)
task_hook.set_progress(1 / 4.0, "Computing edge weights.")
weights = edge_weights(g, hub_penalty, inverse=True)
......
......@@ -3,7 +3,7 @@ import graph_tool.topology as gtt
# def read_graph_tool_graph(file_path, seeds, datasets, ignored_edge_types, max_deg, ignore_non_seed_baits=False, include_indirect_drugs=False, include_non_approved_drugs=False):
def read_graph_tool_graph(file_path, seeds, max_deg, include_indirect_drugs=False, include_non_approved_drugs=False,
def read_graph_tool_graph(file_path, seeds,id_space, max_deg, include_indirect_drugs=False, include_non_approved_drugs=False,
target='drug'):
r"""Reads a graph-tool graph from file.
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment