Commit 763031b2 authored by AndiMajore

adjusted id-space wise analysis preparation and execution

Former-commit-id: dd709695
parent f40b48a2
+from collections import defaultdict
 from typing import List, Tuple

 import graph_tool.all as gt

 from drugstone import models
@@ -77,7 +78,7 @@ def _internal_ppis(dataset) -> List[models.ProteinProteinInteraction]:
     return node_node_interaction_objects


-def create_gt(params: Tuple) -> None:
+def create_gt(params: List[str]) -> None:
     """Fetches all required information to build a graph-tools file for given
     PPI and PDI dataset names (params). Builds the graph-tools file and saves it in
     the data/Networks folder.
@@ -85,37 +86,31 @@ def create_gt(params: Tuple) -> None:
     Args:
         params (Tuple[str, str]): Protein-protein-dataset name, Protein-drug-dataset name
     """
-    ppi_dataset, pdi_dataset = params
+    ppi_dataset, pdi_dataset, identifier = params
     licensed = ppi_dataset.licenced or pdi_dataset.licenced
     # get data from api
     g = gt.Graph(directed=False)
     e_type = g.new_edge_property("string")
     v_type = g.new_vertex_property("string")
     v_name = g.new_vertex_property("string")
-    v_drugstone_id = g.new_vertex_property("string")
-    v_has_symbol = g.new_vertex_property("bool")
-    v_has_entrez = g.new_vertex_property("bool")
-    v_has_ensembl = g.new_vertex_property("bool")
-    v_expression = g.new_vertex_property("string")

     # for drugs
     v_status = g.new_vertex_property("string")
     v_drug_id = g.new_vertex_property("string")
+    v_internal_id = g.new_vertex_property("string")

     g.edge_properties["type"] = e_type
-    g.edge_properties["drugstone_id"] = e_type
+    # g.edge_properties["drugstone_id"] = e_type
     g.vertex_properties["type"] = v_type
     g.vertex_properties["name"] = v_name
-    g.vertex_properties["drugstone_id"] = v_drugstone_id
-    g.vertex_properties["has_symbol"] = v_has_symbol
-    g.vertex_properties["has_entrez"] = v_has_entrez
-    g.vertex_properties["has_ensembl"] = v_has_ensembl
     g.vertex_properties["status"] = v_status
     g.vertex_properties["drug_id"] = v_drug_id
-    g.vertex_properties["expression"] = v_expression
+    g.vertex_properties["internal_id"] = v_internal_id

     # store nodes to connect them when creating edges
     vertices = {}
@@ -123,21 +118,46 @@ def create_gt(params: Tuple) -> None:
     # add vertices
     # print("adding nodes")
-    print(f'loading nodes')
+    print(f'loading nodes for {identifier}')
     # extend node data by cancer nodes, we create a normal node for each cancer node.
     # on reading the data, we decide which one to keep based on the user selected cancer types
-    has_ensembl_set = {node.protein_id for node in models.EnsemblGene.objects.all()}
+    is_entrez = identifier == 'entrez'
+    is_symbol = identifier == 'symbol'
+    is_uniprot = identifier == 'uniprot'
+    is_ensg = identifier == 'ensg'
+    if is_ensg:
+        ensembl_set = defaultdict(set)
+        for node in models.EnsemblGene.objects.all():
+            ensembl_set[node.protein_id].add(node.name)
+    node_id_map = defaultdict(set)
+    drugstone_ids_to_node_ids = defaultdict(set)
     for node in models.Protein.objects.all():
+        if is_entrez:
+            if len(node.entrez) != 0:
+                node_id_map[node.entrez].add(node.id)
+                drugstone_ids_to_node_ids[node.id].add(node.entrez)
+        elif is_symbol:
+            if len(node.gene) != 0:
+                node_id_map[node.gene].add(node.id)
+                drugstone_ids_to_node_ids[node.id].add(node.gene)
+        elif is_uniprot:
+            node_id_map[node.uniprot_code].add(node.id)
+            drugstone_ids_to_node_ids[node.id].add(node.uniprot_code)
+        elif is_ensg:
+            for id in ensembl_set[node.id]:
+                node_id_map[id].add(node.id)
+                drugstone_ids_to_node_ids[node.id].add(id)
+    for id, nodes in node_id_map.items():
         v = g.add_vertex()
         v_type[v] = 'protein'
-        v_drugstone_id[v] = f"p{node.id}"
-        v_has_symbol[v] = len(node.gene) != 0
-        v_has_entrez[v] = len(node.entrez) != 0
-        v_has_ensembl[v] = node.id in has_ensembl_set
-        vertices[node.id] = v
+        v_internal_id[v] = id
+        for drugstone_id in nodes:
+            vertices[drugstone_id] = v
     print("done with nodes")
     print(f"adding drugs")
@@ -145,21 +165,41 @@ def create_gt(params: Tuple) -> None:
         v = g.add_vertex()
         v_type[v] = 'drug'
         v_status[v] = node.status
-        v_drugstone_id[v] = f'dr{node.id}'
+        v_internal_id[v] = f'dr{node.id}'
         drug_vertices[node.id] = v
     print("done with drugs")

     # add edges
     print(f'adding ppi_edges/{ppi_dataset}')
+    uniq_edges = set()
     for edge_raw in _internal_ppis(ppi_dataset):
-        e = g.add_edge(vertices[edge_raw.from_protein_id], vertices[edge_raw.to_protein_id])
-        e_type[e] = 'protein-protein'
+        id1 = edge_raw.from_protein_id
+        id2 = edge_raw.to_protein_id
+        if id1 > id2:
+            tmp = id1
+            id1 = id2
+            id2 = tmp
+        hash = f'{id1}_{id2}'
+        if hash not in uniq_edges and id1 in vertices and id2 in vertices:
+            uniq_edges.add(hash)
+            e = g.add_edge(vertices[id1], vertices[id2])
+            e_type[e] = 'protein-protein'
     print("done with edges")
+    uniq_edges = set()
     print(f'loading drug_edges/{pdi_dataset}')
     for edge_raw in _internal_pdis(pdi_dataset):
-        e = g.add_edge(drug_vertices[edge_raw.drug_id], vertices[edge_raw.protein_id])
-        e_type[e] = 'drug-protein'
+        id1 = edge_raw.drug_id
+        id2 = edge_raw.protein_id
+        hash = f'{id1}_{id2}'
+        if hash not in uniq_edges and id1 in drug_vertices and id2 in vertices:
+            uniq_edges.add(hash)
+            e = g.add_edge(drug_vertices[id1], vertices[id2])
+            e_type[e] = 'drug-protein'
     print("done with drug edges")
@@ -177,7 +217,7 @@ def create_gt(params: Tuple) -> None:
     g.remove_vertex(reversed(sorted(delete_vertices)), fast=True)
     # save graph
-    filename = f"./data/Networks/internal_{ppi_dataset.name}_{pdi_dataset.name}"
+    filename = f"./data/Networks/{identifier}_{ppi_dataset.name}-{pdi_dataset.name}"
     if licensed:
         filename += "_licenced"
     filename += ".gt"
@@ -195,11 +235,25 @@ class Command(BaseCommand):
         pdi_datasets = models.PDIDataset.objects.all()
+        licenced_ppi_dataset = {ppi.name: ppi for ppi in ppi_datasets if ppi.licenced}
+        licenced_pdi_dataset = {pdi.name: pdi for pdi in pdi_datasets if pdi.licenced}
+        uniq_combis = set()
         parameter_combinations = []
         for protein_interaction_dataset in ppi_datasets:
             for pdi_dataset in pdi_datasets:
-                parameter_combinations.append((protein_interaction_dataset, pdi_dataset))
+                licenced = protein_interaction_dataset.licenced or pdi_dataset.licenced
+                if licenced:
+                    protein_interaction_dataset = licenced_ppi_dataset[
+                        protein_interaction_dataset.name] if protein_interaction_dataset.name in licenced_ppi_dataset else protein_interaction_dataset
+                    pdi_dataset = licenced_pdi_dataset[
+                        pdi_dataset.name] if pdi_dataset.name in licenced_pdi_dataset else pdi_dataset
+                hash = f'{protein_interaction_dataset.name}-{pdi_dataset.name}_{licenced}'
+                if hash in uniq_combis:
+                    continue
+                uniq_combis.add(hash)
+                for identifier in ['ensg', 'symbol', 'entrez', 'uniprot']:
+                    parameter_combinations.append([protein_interaction_dataset, pdi_dataset, identifier])

         # close all database connections so subprocesses will create their own connections
         # this prevents the processes from running into problems because of using the same connection
         db.connections.close_all()
...
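The core of this change is that graph vertices are now keyed by the selected external identifier, with all internal drugstone protein IDs that share that identifier collapsed onto one vertex. A minimal standalone sketch of that mapping, where plain tuples stand in for the Django models.Protein rows (all data here is hypothetical):

    from collections import defaultdict

    # Hypothetical stand-ins for models.Protein rows: (internal id, entrez, gene symbol, uniprot)
    proteins = [
        (1, '348', 'APOE', 'P02649'),
        (2, '348', 'APOE', 'C0JYY2'),  # same entrez/symbol, different uniprot entry
        (3, '7157', 'TP53', 'P04637'),
    ]

    def build_id_map(identifier):
        """Collect internal protein IDs per external ID, as create_gt now does."""
        column = {'entrez': 1, 'symbol': 2, 'uniprot': 3}[identifier]
        node_id_map = defaultdict(set)
        for row in proteins:
            if row[column]:  # mirrors the len(...) != 0 guards above
                node_id_map[row[column]].add(row[0])
        return node_id_map

    # 'entrez' collapses rows 1 and 2 into one vertex keyed '348', while
    # 'uniprot' keeps them apart -- hence the edge deduplication afterwards.
    print(build_id_map('entrez'))   # {'348': {1, 2}, '7157': {3}}
    print(build_id_map('uniprot'))  # {'P02649': {1}, 'C0JYY2': {2}, 'P04637': {3}}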
@@ -215,20 +215,20 @@ def populate(kwargs):
         print(f'Populated {n} DrDi associations from DrugBank.')

     if kwargs['protein_protein']:
-        print('Importing PPIs from unlicenced NeDRexDB...')
-        n = NedrexImporter.import_protein_protein_interactions(importer,
-                                                               DatasetLoader.get_ppi_nedrex(nedrex_api_url_unlicenced, False),
-                                                               update)
-        total_n += n
-        print(f'Imported {n} PPIs from unlicended NeDRexDB')
-        print('Importing PPIs from licenced NeDRexDB...')
-        n = NedrexImporter.import_protein_protein_interactions(importer,
-                                                               DatasetLoader.get_ppi_nedrex(nedrex_api_url_licenced,
-                                                                                            True),
-                                                               update)
-        total_n += n
-        nedrex_update = True
-        print(f'Imported {n} PPIs from licended NeDRexDB')
+        # print('Importing PPIs from unlicenced NeDRexDB...')
+        # n = NedrexImporter.import_protein_protein_interactions(importer,
+        #                                                        DatasetLoader.get_ppi_nedrex(nedrex_api_url_unlicenced, False),
+        #                                                        update)
+        # total_n += n
+        # print(f'Imported {n} PPIs from unlicended NeDRexDB')
+        # print('Importing PPIs from licenced NeDRexDB...')
+        # n = NedrexImporter.import_protein_protein_interactions(importer,
+        #                                                        DatasetLoader.get_ppi_nedrex(nedrex_api_url_licenced,
+        #                                                                                     True),
+        #                                                        update)
+        # total_n += n
+        # nedrex_update = True
+        # print(f'Imported {n} PPIs from licended NeDRexDB')
         print('Populating PPIs from STRING...')
         n = DataPopulator.populate_ppi_string(populator, DatasetLoader.get_ppi_string(), update)
         total_n += n
...
@@ -78,11 +78,9 @@ class TaskView(APIView):
         licenced = parameters.get('licenced', False)

         # find databases based on parameter strings
-        print(get_ppi_ds(parameters.get('ppi_dataset', DEFAULTS['ppi']), licenced))
         parameters['ppi_dataset'] = PPIDatasetSerializer().to_representation(
             get_ppi_ds(parameters.get('ppi_dataset', DEFAULTS['ppi']), licenced))
-        print(get_pdi_ds(parameters.get('pdi_dataset', DEFAULTS['pdi']), licenced))
         parameters['pdi_dataset'] = PDIDatasetSerializer().to_representation(
             get_pdi_ds(parameters.get('pdi_dataset', DEFAULTS['pdi']), licenced))
@@ -177,10 +175,6 @@ def map_nodes(request) -> Response:
     nodes_mapped, id_key = query_proteins_by_identifier(node_ids, identifier)

     # change data structure to dict in order to be quicker when merging
-    # if identifier == 'ensg':
-    #     # a protein might have multiple ensg-numbers, unpack these into single nodes
-    #     nodes_mapped_dict = {node_id: node for node in nodes_mapped for node_id in node[id_key]}
-    # else:
     nodes_mapped_dict = {node[id_key][0]: node for node in nodes_mapped}

     # merge fetched data with given data to avoid data loss
@@ -265,8 +259,12 @@ def result_view(request) -> Response:
     drugs = []

     network = result['network']
-    node_types = {}
-    node_attributes['node_types'] = node_types
-    is_seed = {}
-    node_attributes['is_seed'] = is_seed
+    node_types = node_attributes.get('node_types')
+    if not node_types:
+        node_types = {}
+        node_attributes['node_types'] = node_types
+    is_seed = node_attributes.get('is_seed')
+    if not is_seed:
+        is_seed = {}
+        node_attributes['is_seed'] = is_seed
     scores = node_attributes.get('scores', {})
@@ -286,7 +284,7 @@ def result_view(request) -> Response:
     # merge input network with result network
     for node in parameters['input_network']['nodes']:
         # if node was already mapped, add user defined values to result of analysis
-        if identifier in identifier_nodes:
+        if identifier in node:
             node_name = node[identifier][0]
             if node_name in node_details:
                 # update the node to not lose user input attributes
@@ -310,12 +308,32 @@ def result_view(request) -> Response:
                 result['node_attributes']['node_types'][node_id] = 'custom'

     # extend the analysis network by the input netword nodes
     # map edge endpoints to database proteins if possible and add edges to analysis network
+    protein_nodes = set()
     # mapping all new protein and drug nodes by drugstoneIDs + adding scores
     for node_id in nodes:
-        if node_id[0] == 'p':
-            node_data = ProteinNodeSerializer().to_representation(Protein.objects.get(id=int(node_id[1:])))
+        if node_id[:2] == 'dr':
+            node_data = DrugSerializer().to_representation(Drug.objects.get(id=int(node_id[2:])))
+            node_data['drugstoneType'] = 'drug'
+            drugs.append(node_data)
+            if node_id in scores:
+                node_data['score'] = scores.get(node_id, None)
+            node_types[node_id] = 'drug'
+            node_details[node_id] = node_data
+        elif node_id[:2] != 'di':
+            protein_nodes.add(node_id)
+        else:
+            continue
+
+    nodes_mapped, _ = query_proteins_by_identifier(protein_nodes, identifier)
+    nodes_mapped_dict = {node[identifier][0]: node for node in nodes_mapped}
+
+    # merge fetched data with given data to avoid data loss
+    for node_id in nodes:
+        if node_id in nodes_mapped_dict:
+            # node.update(nodes_mapped_dict[node['id']])
+            node_data = nodes_mapped_dict[node_id]
+            node_data['drugstoneType'] = 'protein'
             # proteins.append(node_data)
             node_ident = node_data[identifier][0]
             # node_data[identifier] = [node_ident]
@@ -326,32 +344,16 @@ def result_view(request) -> Response:
             score = scores.get(node_id, None)
             if node_ident in node_details:
                 data = node_details[node_ident]
-                data['entrez'].extend(node_data['entrez'])
-                data['ensg'].extend(node_data['ensg'])
-                data['symbol'].extend(node_data['symbol'])
-                data['uniprot_ac'].extend(node_data['uniprot_ac'])
-                if score:
-                    if 'score' in data:
-                        data['score'].append(score)
-                    else:
-                        data['score'] = [score] if score else []
+                data['score'] = [score] if score else None
             else:
-                node_data['score'] = [score] if score else []
+                node_data['score'] = score if score else None
                 node_data['drugstoneType'] = 'protein'
                 node_data['id'] = node_ident
                 node_data['label'] = node_ident
                 node_details[node_ident] = node_data
-        elif node_id[:2] == 'dr':
-            node_data = DrugSerializer().to_representation(Drug.objects.get(id=int(node_id[2:])))
-            drugs.append(node_data)
-            if node_id in scores:
-                node_data['score'] = scores.get(node_id, None)
-            node_types[node_id] = 'drug'
-            node_details[node_id] = node_data
-        else:
-            continue

     for node_id, detail in node_details.items():
+        if 'drugstoneType' in detail and detail['drugstoneType'] == 'protein':
             detail['symbol'] = list(set(detail['symbol']))
             detail['entrez'] = list(set(detail['entrez']))
             detail['uniprot_ac'] = list(set(detail['uniprot_ac']))
@@ -389,7 +391,12 @@ def result_view(request) -> Response:
             map(lambda n: {"from": f'p{n.from_protein_id}', "to": f'p{n.to_protein_id}'}, interaction_objects))
         edges.extend(auto_edges)

     result['network']['edges'].extend(edges)
-    result['network']['nodes'] = list(identifier_nodes)
+    # uniq_edges = dict()
+    # for edge in result['network']['edges']:
+    #     hash = edge['from'] + edge['to']
+    #     uniq_edges[hash] = edge
+    # result['network']['edges']=list(uniq_edges.values())
+    # result['network']['nodes'] = list(identifier_nodes)
     if 'scores' in result['node_attributes']:
         del result['node_attributes']['scores']
...
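The commented-out block at the end of result_view sketches edge deduplication keyed on the from/to pair. As a standalone illustration of that idea (toy edge dicts, not the Django objects); note that joining the two endpoints with a separator avoids ambiguous keys, which the plain concatenation in the commented code would not:

    edges = [
        {'from': 'p1', 'to': 'p2'},
        {'from': 'p1', 'to': 'p2'},  # duplicate
        {'from': 'p2', 'to': 'p3'},
    ]
    uniq_edges = {}
    for edge in edges:
        key = f"{edge['from']}_{edge['to']}"  # 'p1_p2'; bare concatenation could collide
        uniq_edges[key] = edge
    print(list(uniq_edges.values()))  # [{'from': 'p1', 'to': 'p2'}, {'from': 'p2', 'to': 'p3'}]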
@@ -172,15 +172,13 @@ def betweenness_centrality(task_hook: TaskHook):
     id_space = task_hook.parameters["config"].get("identifier","symbol")
-    print(id_space)

     # Parsing input file.
     task_hook.set_progress(0 / 3.0, "Parsing input.")
-    filename = f"internal_{ppi_dataset['name']}_{pdi_dataset['name']}"
+    filename = f"{id_space}_{ppi_dataset['name']}-{pdi_dataset['name']}"
     if ppi_dataset['licenced'] or pdi_dataset['licenced']:
         filename += "_licenced"
     filename = os.path.join(task_hook.data_directory, filename + ".gt")
-    g, seed_ids, id_space, drug_ids = read_graph_tool_graph(
+    g, seed_ids, drug_ids = read_graph_tool_graph(
         filename,
         seeds,
         max_deg,
...
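All task scripts in this commit resolve the pre-built .gt files with the same naming rule that create_gt now uses. A small sketch of the convention (the dataset names here are illustrative):

    import os

    def gt_filename(id_space, ppi_name, pdi_name, licenced, data_dir='./data/Networks'):
        # '{id_space}_{PPI}-{PDI}[_licenced].gt', matching create_gt and the tasks
        name = f"{id_space}_{ppi_name}-{pdi_name}"
        if licenced:
            name += "_licenced"
        return os.path.join(data_dir, name + ".gt")

    print(gt_filename('symbol', 'STRING', 'DrugBank', False))
    # ./data/Networks/symbol_STRING-DrugBank.gt
    print(gt_filename('entrez', 'NeDRex', 'NeDRex', True))
    # ./data/Networks/entrez_NeDRex-NeDRex_licenced.gt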
@@ -170,12 +170,12 @@ def closeness_centrality(task_hook: TaskHook):
     # Parsing input file.
     task_hook.set_progress(0 / 4.0, "Parsing input.")
-    filename = f"internal_{ppi_dataset['name']}_{pdi_dataset['name']}"
-    if ppi_dataset['licenced'] or pdi_dataset['licenced']:
-        filename += "_licenced"
     id_space = task_hook.parameters["config"].get("identifier", "symbol")
+    filename = f"{id_space}_{ppi_dataset['name']}-{pdi_dataset['name']}"
+    if ppi_dataset['licenced'] or pdi_dataset['licenced']:
+        filename += "_licenced"
     filename = os.path.join(task_hook.data_directory, filename + ".gt")
     # g, seed_ids, viral_protein_ids, drug_ids = read_graph_tool_graph(file_path, seeds, datasets, ignored_edge_types, max_deg, ignore_non_seed_baits, include_indirect_drugs, include_non_approved_drugs)
     g, seed_ids, drug_ids = read_graph_tool_graph(filename, seeds, id_space, max_deg, include_indirect_drugs, include_non_approved_drugs, search_target)
...
@@ -150,12 +150,12 @@ def degree_centrality(task_hook: TaskHook):
     # Parsing input file.
     task_hook.set_progress(0 / 3.0, "Parsing input.")
-    filename = f"internal_{ppi_dataset['name']}_{pdi_dataset['name']}"
-    if ppi_dataset['licenced'] or pdi_dataset['licenced']:
-        filename += "_licenced"
     id_space = task_hook.parameters["config"].get("identifier", "symbol")
+    filename = f"{id_space}_{ppi_dataset['name']}-{pdi_dataset['name']}"
+    if ppi_dataset['licenced'] or pdi_dataset['licenced']:
+        filename += "_licenced"
     filename = os.path.join(task_hook.data_directory, filename + ".gt")
     # g, seed_ids, viral_protein_ids, drug_ids = read_graph_tool_graph(file_path, seeds, datasets, ignored_edge_types, max_deg, ignore_non_seed_baits, False, include_non_approved_drugs)
     g, seed_ids, drug_ids = read_graph_tool_graph(filename, seeds, id_space, max_deg, False, include_non_approved_drugs, search_target)
...
@@ -10,7 +10,7 @@ import requests

 from tasks.task_hook import TaskHook
-from drugstone.models import Protein
+from drugstone.models import Protein, EnsemblGene

 # Base URL
 # url = 'http://172.25.0.1:9003/keypathwayminer/requests/'
@@ -57,9 +57,18 @@ def kpm_task(task_hook: TaskHook):
     # --- Fetch and generate the datasets
     dataset_name = 'indicatorMatrix'
     indicator_matrix_string = ''
-    protein_backend_ids = [int(seed[1:]) for seed in task_hook.seeds]
-    proteins = Protein.objects.filter(id__in=protein_backend_ids)
+    id_space = task_hook.parameters["config"].get("identifier", "symbol")
+    proteins = []
+    if id_space == 'symbol':
+        proteins = Protein.objects.filter(gene__in=task_hook.seeds)
+    elif id_space == 'entrez':
+        proteins = Protein.objects.filter(entrez__in=task_hook.seeds)
+    elif id_space == 'uniprot':
+        proteins = Protein.objects.filter(uniprot_code__in=task_hook.seeds)
+    elif id_space == 'ensg':
+        protein_ids = {ensg.protein_id for ensg in EnsemblGene.objects.filter(name__in=task_hook.seeds)}
+        proteins = Protein.objects.filter(id__in=protein_ids)
+    protein_backend_ids = {p.id for p in proteins}
     for protein in proteins:
         indicator_matrix_string += f'{protein.uniprot_code}\t1\n'
...
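A standalone sketch of the new seed resolution for the KPM indicator matrix, where a toy lookup table stands in for the Django Protein/EnsemblGene queries (all values hypothetical): seeds arrive in the configured id space, but KPM is still fed UniProt accessions:

    # (internal id, gene symbol, entrez, uniprot_code)
    table = [(1, 'APOE', '348', 'P02649'), (2, 'TP53', '7157', 'P04637')]

    def resolve(seeds, id_space):
        col = {'symbol': 1, 'entrez': 2, 'uniprot': 3}[id_space]
        return [row for row in table if row[col] in seeds]

    rows = resolve({'APOE', 'TP53'}, 'symbol')
    indicator_matrix_string = ''.join(f'{row[3]}\t1\n' for row in rows)
    print(repr(indicator_matrix_string))  # 'P02649\t1\nP04637\t1\n'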
@@ -97,7 +97,7 @@ def multi_steiner(task_hook: TaskHook):
     search_target = task_hook.parameters.get("target", "drug-target")

-    node_name_attribute = "drugstone_id"  # nodes in the input network which is created from RepoTrialDB have primaryDomainId as name attribute
+    node_name_attribute = "internal_id"  # nodes in the input network which is created from RepoTrialDB have primaryDomainId as name attribute

     # Set number of threads if OpenMP support is enabled.
     if gt.openmp_enabled():
@@ -108,10 +108,9 @@ def multi_steiner(task_hook: TaskHook):
     id_space = task_hook.parameters["config"].get("identifier", "symbol")
-    filename = f"internal_{ppi_dataset['name']}_{pdi_dataset['name']}"
+    filename = f"{id_space}_{ppi_dataset['name']}-{pdi_dataset['name']}"
     if ppi_dataset['licenced'] or pdi_dataset['licenced']:
         filename += "_licenced"
     filename = os.path.join(task_hook.data_directory, filename + ".gt")
     g, seed_ids, _ = read_graph_tool_graph(filename, seeds, id_space, max_deg, target=search_target)
     # seed_map = {g.vertex_properties["name"][node]: node for node in seed_ids}
...
@@ -89,7 +89,7 @@ def network_proximity(task_hook: TaskHook):
     id_space = task_hook.parameters["config"].get("identifier", "symbol")
-    filename = f"internal_{ppi_dataset['name']}_{pdi_dataset['name']}"
+    filename = f"{id_space}_{ppi_dataset['name']}-{pdi_dataset['name']}"
     if ppi_dataset['licenced'] or pdi_dataset['licenced']:
         filename += "_licenced"
     filename = os.path.join(task_hook.data_directory, filename + ".gt")
...
@@ -201,7 +201,7 @@ def trust_rank(task_hook: TaskHook):
     id_space = task_hook.parameters["config"].get("identifier", "symbol")
-    filename = f"internal_{ppi_dataset['name']}_{pdi_dataset['name']}"
+    filename = f"{id_space}_{ppi_dataset['name']}-{pdi_dataset['name']}"
     if ppi_dataset['licenced'] or pdi_dataset['licenced']:
         filename += "_licenced"
     filename = os.path.join(task_hook.data_directory, filename+".gt")
...
@@ -17,7 +17,7 @@ def __dfs_find_bridges(g, node, visited, disc, low, parent, is_bridge):
             low[node] = min(low[node], low[nb])
             if low[nb] > disc[node]:
                 is_bridge[g.edge(node, nb)] = True
-        elif int(nb) != parent[node]:  #TODO can in theory be removed because
+        elif int(nb) != parent[node]:  #TODO can in theory be removed
             low[node] = min(low[node], disc[nb])


 def find_bridges(g):
...
@@ -3,7 +3,8 @@ import graph_tool.topology as gtt

 # def read_graph_tool_graph(file_path, seeds, datasets, ignored_edge_types, max_deg, ignore_non_seed_baits=False, include_indirect_drugs=False, include_non_approved_drugs=False):
-def read_graph_tool_graph(file_path, seeds,id_space, max_deg, include_indirect_drugs=False, include_non_approved_drugs=False,
+def read_graph_tool_graph(file_path, seeds, id_space, max_deg, include_indirect_drugs=False,
+                          include_non_approved_drugs=False,
                           target='drug'):
     r"""Reads a graph-tool graph from file.
@@ -45,7 +46,7 @@ def read_graph_tool_graph(file_path, seeds, id_space, max_deg, include_indirect_d
     # drug_protein = "DrugHasTarget"
     d_type = "drug"
-    node_name_attribute = "drugstone_id"  # nodes in the input network which is created from RepoTrialDB have primaryDomainId as name attribute
+    node_name_attribute = "internal_id"  # nodes in the input network which is created from RepoTrialDB have primaryDomainId as name attribute

     # Delete all nodes that are not contained in the selected datasets and have degrees higher than max_deg
     deleted_nodes = []
     for node in range(g.num_vertices()):
@@ -59,20 +60,18 @@ def read_graph_tool_graph(file_path, seeds, id_space, max_deg, include_indirect_d
         # remove all drugs from graph if we are not looking for drugs
         elif target != 'drug' and g.vertex_properties["type"][node] == d_type:
             deleted_nodes.append(node)
-    g.remove_vertex(deleted_nodes, fast=True)
+    g.remove_vertex(reversed(sorted(deleted_nodes)), fast=True)

     # Retrieve internal IDs of seed_ids
     seeds = set(seeds)
-    print(seeds)
     seed_ids = {}
     drug_ids = []
-    # is_matched = {protein: False for protein in seeds}
     for node in range(g.num_vertices()):
         node_type = g.vertex_properties["type"][node]
         seed_id = g.vertex_properties[node_name_attribute][node]
         if seed_id in seeds:
             seed_ids[node] = seed_id
-            # is_matched[seed_id] = node
         if node_type == d_type:
             if include_non_approved_drugs:
                 drug_ids.append(node)
@@ -81,16 +80,6 @@ def read_graph_tool_graph(file_path, seeds, id_space, max_deg, include_indirect_d
                 if "approved" in drug_groups:
                     drug_ids.append(node)

-    # Check that all seed seeds have been matched and throw error, otherwise.
-    # print(deleted_nodes)
-    # print(seed_ids)
-    # seeds = set(seed_ids.values())
-    # for (node, seed_id) in seed_ids.items():
-    #     if is_matched[node]
-    # for protein, found in is_matched.items():
-    #     if not found:
-    #         raise ValueError("Invalid seed protein {}. No node named {} in {}.".format(protein, protein, file_path))

     # Delete edges that should be ignored or are not contained in the selected dataset.
     deleted_edges = []
@@ -138,17 +127,11 @@ def read_graph_tool_graph(file_path, seeds, id_space, max_deg, include_indirect_d
     for edge in deleted_edges:
         g.remove_edge(edge)
     g.set_fast_edge_removal(fast=False)
-    print("Drugs")
-    print(drug_ids)
-    print("Vertices")
     vertices = 0
     for _ in g.vertices():
         vertices += 1
-    print(f'\t{vertices}')
-    print("Edges")
     edges = 0
     for _ in g.edges():
         edges += 1
-    print(f'\t{edges}')

     # Return the graph and the indices of the seed_ids and the seeds.
     return g, list(seed_ids.keys()), drug_ids
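With vertices now carrying the external identifier in "internal_id", seed matching reduces to comparing that property against the seed set. A minimal standalone sketch of the loop above, using toy property lists instead of a graph-tool graph (all values hypothetical):

    internal_ids = ['APOE', 'TP53', 'dr17']      # per-vertex 'internal_id' values
    node_types = ['protein', 'protein', 'drug']  # per-vertex 'type' values
    seeds = {'APOE', 'BRCA1'}                    # 'BRCA1' simply matches no vertex

    seed_ids = {i: name for i, name in enumerate(internal_ids) if name in seeds}
    drug_ids = [i for i, t in enumerate(node_types) if t == 'drug']
    print(list(seed_ids.keys()), drug_ids)  # [0] [2]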
@@ -15,7 +15,7 @@ def scores_to_results(
     r"""Transforms the scores to the required result format."""

-    node_name_attribute = "drugstone_id"  # nodes in the input network which is created from RepoTrialDB have primaryDomainId as name attribute
+    node_name_attribute = "internal_id"  # nodes in the input network which is created from RepoTrialDB have primaryDomainId as name attribute
     candidates = []
     # if strain_or_drugs == "drugs":
     if target == "drug":
@@ -23,8 +23,6 @@ def scores_to_results(
     else:
         candidates = [(node, scores[node]) for node in range(g.num_vertices()) if scores[node] > 0 and node not in set(seed_ids)]
     best_candidates = [item[0] for item in sorted(candidates, key=lambda item: item[1], reverse=True)[:result_size]]
-    print(f'Candidate list length: {len(best_candidates)}')

     # Concatenate best result candidates with seeds and compute induced subgraph.
     # since the result size filters out nodes, the result network is not complete anymore.
     # Therefore, it is necessary to find the shortest paths to the found nodes in case intermediate nodes have been removed.
@@ -36,6 +34,7 @@ def scores_to_results(
     returned_nodes = set(seed_ids)  # return seed_ids in any case

     # return only the path to a drug with the shortest distance
+    accepted_candidates = set()
     if filterPaths:
         for candidate in best_candidates:
             distances = gtt.shortest_distance(g, candidate, seed_ids)
@@ -53,10 +52,11 @@ def scores_to_results(
                     break
             if drug_in_path:
                 continue
+            accepted_candidates.add(g.vertex_properties[node_name_attribute][int(candidate)])
             for vertex in vertices:
                 if int(vertex) not in returned_nodes:
                     # inserting intermediate node in order to make result comprehensive
+                    if vertex != candidate:
                         intermediate_nodes.add(g.vertex_properties[node_name_attribute][int(vertex)])
                     returned_nodes.add(int(vertex))
             for edge in edges:
@@ -74,16 +74,19 @@ def scores_to_results(
                         break
                 if drug_in_path:
                     continue
+                accepted_candidates.add(g.vertex_properties[node_name_attribute][int(candidate)])
                 for vertex in vertices:
                     if int(vertex) not in returned_nodes:
                         # inserting intermediate node in order to make result comprehensive
+                        if vertex != candidate:
                             intermediate_nodes.add(g.vertex_properties[node_name_attribute][int(vertex)])
                         returned_nodes.add(int(vertex))
                 for edge in edges:
                     if ((edge.source(), edge.target()) not in returned_edges) or ((edge.target(), edge.source()) not in returned_edges):
                         returned_edges.add((edge.source(), edge.target()))
-    print(f'Returned nodes number: {len(returned_nodes)}')
+    for node in accepted_candidates:
+        if node in intermediate_nodes:
+            intermediate_nodes.remove(node)

     subgraph = {
         "nodes": [g.vertex_properties[node_name_attribute][node] for node in returned_nodes],
         "edges": [{"from": g.vertex_properties[node_name_attribute][source], "to": g.vertex_properties[node_name_attribute][target]} for source, target in returned_edges],
@@ -97,6 +100,7 @@ def scores_to_results(
     return {
         "network": subgraph,
         'intermediate_nodes': list(intermediate_nodes),
+        'target_nodes': list(accepted_candidates),
         "node_attributes":
             {
                 "node_types": node_types,
...
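The bookkeeping added in scores_to_results separates candidates that survive the drug-in-path filter ("target_nodes") from nodes that only appear on the connecting shortest paths ("intermediate_nodes"). A toy sketch of that set logic (all node names invented):

    accepted_candidates = {'P1', 'P4'}     # candidates kept by the filter
    path_nodes = {'P1', 'P2', 'P3', 'P4'}  # non-seed nodes on kept paths
    intermediate_nodes = set(path_nodes)

    # mirror of the final clean-up loop: accepted candidates are not intermediates
    for node in accepted_candidates:
        if node in intermediate_nodes:
            intermediate_nodes.remove(node)
    print(sorted(intermediate_nodes))  # ['P2', 'P3']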
@@ -5,7 +5,7 @@ import itertools as it


 def steiner_tree(g, seeds, seed_map, weights, non_zero_hub_penalty):
-    node_name_attribute = "drugstone_id"  # nodes in the input network which is created from RepoTrialDB have primaryDomainId as name attribute
+    node_name_attribute = "internal_id"  # nodes in the input network which is created from RepoTrialDB have primaryDomainId as name attribute
     mc = gt.Graph(directed=False)
     eprop_dist = mc.new_edge_property("int")
     mc.ep['dist'] = eprop_dist
...