Commit 763031b2 authored by AndiMajore

Adjusted ID-space-wise analysis preparation and execution

Former-commit-id: dd709695
parent f40b48a2
from collections import defaultdict
from typing import List, Tuple
import graph_tool.all as gt
from drugstone import models
@@ -77,7 +78,7 @@ def _internal_ppis(dataset) -> List[models.ProteinProteinInteraction]:
return node_node_interaction_objects
def create_gt(params: Tuple) -> None:
def create_gt(params: List[str]) -> None:
"""Fetches all required information to build a graph-tools file for given
PPI and PDI dataset names (params). Builds the graph-tools file and saves it in
the data/Networks folder.
......@@ -85,37 +86,31 @@ def create_gt(params: Tuple) -> None:
Args:
params (Tuple[str, str]): Protein-protein-dataset name, Protein-drug-dataset name
"""
ppi_dataset, pdi_dataset = params
ppi_dataset, pdi_dataset, identifier = params
licensed = ppi_dataset.licenced or pdi_dataset.licenced
# get data from api
g = gt.Graph(directed=False)
e_type = g.new_edge_property("string")
v_type = g.new_vertex_property("string")
v_name = g.new_vertex_property("string")
v_drugstone_id = g.new_vertex_property("string")
v_has_symbol = g.new_vertex_property("bool")
v_has_entrez = g.new_vertex_property("bool")
v_has_ensembl = g.new_vertex_property("bool")
v_expression = g.new_vertex_property("string")
# for drugs
v_status = g.new_vertex_property("string")
v_drug_id = g.new_vertex_property("string")
v_internal_id = g.new_vertex_property("string")
g.edge_properties["type"] = e_type
g.edge_properties["drugstone_id"] = e_type
# g.edge_properties["drugstone_id"] = e_type
g.vertex_properties["type"] = v_type
g.vertex_properties["name"] = v_name
g.vertex_properties["drugstone_id"] = v_drugstone_id
g.vertex_properties["has_symbol"] = v_has_symbol
g.vertex_properties["has_entrez"] = v_has_entrez
g.vertex_properties["has_ensembl"] = v_has_ensembl
g.vertex_properties["status"] = v_status
g.vertex_properties["drug_id"] = v_drug_id
g.vertex_properties["expression"] = v_expression
g.vertex_properties["internal_id"] = v_internal_id
# store nodes to connect them when creating edges
vertices = {}
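For readers unfamiliar with graph-tool: property maps like the ones above attach typed per-vertex or per-edge data, and are persisted with the graph once registered under g.vertex_properties / g.edge_properties. A minimal, self-contained sketch of the pattern (the symbol value and file name are hypothetical):

import graph_tool.all as gt

g = gt.Graph(directed=False)
v_name = g.new_vertex_property("string")  # one string slot per vertex
g.vertex_properties["name"] = v_name      # register it so g.save() keeps it

v = g.add_vertex()
v_name[v] = "TP53"                        # hypothetical protein symbol
g.save("example.gt")                      # .gt files round-trip all registered properties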
@@ -123,21 +118,46 @@ def create_gt(params: Tuple) -> None:
# add vertices
# print("adding nodes")
print(f'loading nodes')
print(f'loading nodes for {identifier}')
# extend node data by cancer nodes, we create a normal node for each cancer node.
# on reading the data, we decide which one to keep based on the user selected cancer types
has_ensembl_set = {node.protein_id for node in models.EnsemblGene.objects.all()}
is_entrez = identifier == 'entrez'
is_symbol = identifier == 'symbol'
is_uniprot = identifier == 'uniprot'
is_ensg = identifier == 'ensg'
if is_ensg:
ensembl_set = defaultdict(set)
for node in models.EnsemblGene.objects.all():
ensembl_set[node.protein_id].add(node.name)
node_id_map = defaultdict(set)
drugstone_ids_to_node_ids = defaultdict(set)
for node in models.Protein.objects.all():
if is_entrez:
if len(node.entrez) != 0:
node_id_map[node.entrez].add(node.id)
drugstone_ids_to_node_ids[node.id].add(node.entrez)
elif is_symbol:
if len(node.gene) != 0:
node_id_map[node.gene].add(node.id)
drugstone_ids_to_node_ids[node.id].add(node.gene)
elif is_uniprot:
node_id_map[node.uniprot_code].add(node.id)
drugstone_ids_to_node_ids[node.id].add(node.uniprot_code)
elif is_ensg:
for id in ensembl_set[node.id]:
node_id_map[id].add(node.id)
drugstone_ids_to_node_ids[node.id].add(id)
for id, nodes in node_id_map.items():
v = g.add_vertex()
v_type[v] = 'protein'
v_drugstone_id[v] = f"p{node.id}"
v_has_symbol[v] = len(node.gene) != 0
v_has_entrez[v] = len(node.entrez) != 0
v_has_ensembl[v] = node.id in has_ensembl_set
vertices[node.id] = v
v_internal_id[v] = id
for drugstone_id in nodes:
vertices[drugstone_id] = v
print("done with nodes")
print(f"adding drugs")
@@ -145,22 +165,42 @@ def create_gt(params: Tuple) -> None:
v = g.add_vertex()
v_type[v] = 'drug'
v_status[v] = node.status
v_drugstone_id[v] = f'dr{node.id}'
v_internal_id[v] = f'dr{node.id}'
drug_vertices[node.id] = v
print("done with drugs")
# add edges
print(f'adding ppi_edges/{ppi_dataset}')
uniq_edges = set()
for edge_raw in _internal_ppis(ppi_dataset):
e = g.add_edge(vertices[edge_raw.from_protein_id], vertices[edge_raw.to_protein_id])
e_type[e] = 'protein-protein'
id1 = edge_raw.from_protein_id
id2 = edge_raw.to_protein_id
if id1 > id2:
id1, id2 = id2, id1
hash = f'{id1}_{id2}'
if hash not in uniq_edges and id1 in vertices and id2 in vertices:
uniq_edges.add(hash)
e = g.add_edge(vertices[id1], vertices[id2])
e_type[e] = 'protein-protein'
print("done with edges")
uniq_edges = set()
print(f'loading drug_edges/{pdi_dataset}')
for edge_raw in _internal_pdis(pdi_dataset):
e = g.add_edge(drug_vertices[edge_raw.drug_id], vertices[edge_raw.protein_id])
e_type[e] = 'drug-protein'
id1 = edge_raw.drug_id
id2 = edge_raw.protein_id
hash = f'{id1}_{id2}'
if hash not in uniq_edges and id1 in drug_vertices and id2 in vertices:
uniq_edges.add(hash)
e = g.add_edge(drug_vertices[id1], vertices[id2])
e_type[e] = 'drug-protein'
print("done with drug edges")
# remove unconnected proteins
@@ -177,7 +217,7 @@ def create_gt(params: Tuple) -> None:
g.remove_vertex(reversed(sorted(delete_vertices)), fast=True)
# save graph
filename = f"./data/Networks/internal_{ppi_dataset.name}_{pdi_dataset.name}"
filename = f"./data/Networks/{identifier}_{ppi_dataset.name}-{pdi_dataset.name}"
if licensed:
filename += "_licenced"
filename += ".gt"
@@ -195,11 +235,25 @@ class Command(BaseCommand):
pdi_datasets = models.PDIDataset.objects.all()
licenced_ppi_dataset = {ppi.name: ppi for ppi in ppi_datasets if ppi.licenced}
licenced_pdi_dataset = {pdi.name: pdi for pdi in pdi_datasets if pdi.licenced}
uniq_combis = set()
parameter_combinations = []
for protein_interaction_dataset in ppi_datasets:
for pdi_dataset in pdi_datasets:
parameter_combinations.append((protein_interaction_dataset, pdi_dataset))
licenced = protein_interaction_dataset.licenced or pdi_dataset.licenced
if licenced:
protein_interaction_dataset = licenced_ppi_dataset.get(
protein_interaction_dataset.name, protein_interaction_dataset)
pdi_dataset = licenced_pdi_dataset.get(pdi_dataset.name, pdi_dataset)
hash = f'{protein_interaction_dataset.name}-{pdi_dataset.name}_{licenced}'
if hash in uniq_combis:
continue
uniq_combis.add(hash)
for identifier in ['ensg', 'symbol', 'entrez', 'uniprot']:
parameter_combinations.append([protein_interaction_dataset, pdi_dataset, identifier])
# close all database connections so subprocesses will create their own connections
# this prevents the processes from running into problems because of using the same connection
db.connections.close_all()
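Closing the connections matters because create_gt is evidently executed in worker processes, and forked workers must not reuse the parent's database socket. A minimal sketch of that pattern, assuming a multiprocessing pool (the pool size and the exact call site are assumptions, not shown in this hunk):

from multiprocessing import Pool

from django import db

db.connections.close_all()  # workers will lazily open their own connections
with Pool(4) as pool:       # pool size is an assumption
    pool.map(create_gt, parameter_combinations)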
......
@@ -215,20 +215,20 @@ def populate(kwargs):
print(f'Populated {n} DrDi associations from DrugBank.')
if kwargs['protein_protein']:
print('Importing PPIs from unlicenced NeDRexDB...')
n = NedrexImporter.import_protein_protein_interactions(importer,
DatasetLoader.get_ppi_nedrex(nedrex_api_url_unlicenced, False),
update)
total_n += n
print(f'Imported {n} PPIs from unlicenced NeDRexDB')
print('Importing PPIs from licenced NeDRexDB...')
n = NedrexImporter.import_protein_protein_interactions(importer,
DatasetLoader.get_ppi_nedrex(nedrex_api_url_licenced,
True),
update)
total_n += n
nedrex_update = True
print(f'Imported {n} PPIs from licenced NeDRexDB')
# print('Importing PPIs from unlicenced NeDRexDB...')
# n = NedrexImporter.import_protein_protein_interactions(importer,
# DatasetLoader.get_ppi_nedrex(nedrex_api_url_unlicenced, False),
# update)
# total_n += n
# print(f'Imported {n} PPIs from unlicenced NeDRexDB')
# print('Importing PPIs from licenced NeDRexDB...')
# n = NedrexImporter.import_protein_protein_interactions(importer,
# DatasetLoader.get_ppi_nedrex(nedrex_api_url_licenced,
# True),
# update)
# total_n += n
# nedrex_update = True
# print(f'Imported {n} PPIs from licenced NeDRexDB')
print('Populating PPIs from STRING...')
n = DataPopulator.populate_ppi_string(populator, DatasetLoader.get_ppi_string(), update)
total_n += n
......
@@ -78,11 +78,9 @@ class TaskView(APIView):
licenced = parameters.get('licenced', False)
# find databases based on parameter strings
print(get_ppi_ds(parameters.get('ppi_dataset', DEFAULTS['ppi']), licenced))
parameters['ppi_dataset'] = PPIDatasetSerializer().to_representation(
get_ppi_ds(parameters.get('ppi_dataset', DEFAULTS['ppi']), licenced))
print(get_pdi_ds(parameters.get('pdi_dataset', DEFAULTS['pdi']), licenced))
parameters['pdi_dataset'] = PDIDatasetSerializer().to_representation(
get_pdi_ds(parameters.get('pdi_dataset', DEFAULTS['pdi']), licenced))
@@ -177,10 +175,6 @@ def map_nodes(request) -> Response:
nodes_mapped, id_key = query_proteins_by_identifier(node_ids, identifier)
# change data structure to dict in order to be quicker when merging
# if identifier == 'ensg':
# # a protein might have multiple ensg-numbers, unpack these into single nodes
# nodes_mapped_dict = {node_id: node for node in nodes_mapped for node_id in node[id_key]}
# else:
nodes_mapped_dict = {node[id_key][0]: node for node in nodes_mapped}
# merge fetched data with given data to avoid data loss
@@ -265,10 +259,14 @@ def result_view(request) -> Response:
drugs = []
network = result['network']
node_types = {}
node_attributes['node_types'] = node_types
is_seed = {}
node_attributes['is_seed'] = is_seed
node_types = node_attributes.get('node_types')
if not node_types:
node_types = {}
node_attributes['node_types'] = node_types
is_seed = node_attributes.get('is_seed')
if not is_seed:
is_seed = {}
node_attributes['is_seed'] = is_seed
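As an aside, the get-or-initialize pattern above can be written more compactly with dict.setdefault; the variant below is equivalent except that setdefault keeps an explicitly stored None, which the code above would replace with a fresh dict:

node_types = node_attributes.setdefault('node_types', {})
is_seed = node_attributes.setdefault('is_seed', {})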
scores = node_attributes.get('scores', {})
node_details = {}
protein_id_map = defaultdict(set)
@@ -286,7 +284,7 @@ def result_view(request) -> Response:
# merge input network with result network
for node in parameters['input_network']['nodes']:
# if node was already mapped, add user defined values to result of analysis
if identifier in identifier_nodes:
if identifier in node:
node_name = node[identifier][0]
if node_name in node_details:
# update the node to not lose user input attributes
@@ -310,12 +308,32 @@ def result_view(request) -> Response:
result['node_attributes']['node_types'][node_id] = 'custom'
# extend the analysis network by the input netword nodes
# map edge endpoints to database proteins if possible and add edges to analysis network
protein_nodes = set()
# mapping all new protein and drug nodes by drugstoneIDs + adding scores
for node_id in nodes:
if node_id[:2] == 'dr':
node_data = DrugSerializer().to_representation(Drug.objects.get(id=int(node_id[2:])))
node_data['drugstoneType'] = 'drug'
drugs.append(node_data)
if node_id in scores:
node_data['score'] = scores.get(node_id, None)
node_types[node_id] = 'drug'
node_details[node_id] = node_data
elif node_id[:2] != 'di':
protein_nodes.add(node_id)
else:
continue
nodes_mapped, _ = query_proteins_by_identifier(protein_nodes, identifier)
nodes_mapped_dict = {node[identifier][0]: node for node in nodes_mapped}
if node_id[0] == 'p':
node_data = ProteinNodeSerializer().to_representation(Protein.objects.get(id=int(node_id[1:])))
# merge fetched data with given data to avoid data loss
for node_id in nodes:
if node_id in nodes_mapped_dict:
# node.update(nodes_mapped_dict[node['id']])
node_data = nodes_mapped_dict[node_id]
node_data['drugstoneType'] = 'protein'
# proteins.append(node_data)
node_ident = node_data[identifier][0]
# node_data[identifier] = [node_ident]
@@ -326,36 +344,20 @@ def result_view(request) -> Response:
score = scores.get(node_id, None)
if node_ident in node_details:
data = node_details[node_ident]
data['entrez'].extend(node_data['entrez'])
data['ensg'].extend(node_data['ensg'])
data['symbol'].extend(node_data['symbol'])
data['uniprot_ac'].extend(node_data['uniprot_ac'])
if score:
if 'score' in data:
data['score'].append(score)
else:
data['score'] = [score] if score else []
data['score'] = [score] if score else None
else:
node_data['score'] = [score] if score else []
node_data['score'] = score if score else None
node_data['drugstoneType'] = 'protein'
node_data['id'] = node_ident
node_data['label'] = node_ident
node_details[node_ident] = node_data
elif node_id[:2] == 'dr':
node_data = DrugSerializer().to_representation(Drug.objects.get(id=int(node_id[2:])))
drugs.append(node_data)
if node_id in scores:
node_data['score'] = scores.get(node_id, None)
node_types[node_id] = 'drug'
node_details[node_id] = node_data
else:
continue
for node_id, detail in node_details.items():
detail['symbol'] = list(set(detail['symbol']))
detail['entrez'] = list(set(detail['entrez']))
detail['uniprot_ac'] = list(set(detail['uniprot_ac']))
detail['ensg'] = list(set(detail['ensg']))
if 'drugstoneType' in detail and detail['drugstoneType'] == 'protein':
detail['symbol'] = list(set(detail['symbol']))
detail['entrez'] = list(set(detail['entrez']))
detail['uniprot_ac'] = list(set(detail['uniprot_ac']))
detail['ensg'] = list(set(detail['ensg']))
edges = parameters['input_network']['edges']
edge_endpoint_ids = set()
@@ -389,7 +391,12 @@ def result_view(request) -> Response:
map(lambda n: {"from": f'p{n.from_protein_id}', "to": f'p{n.to_protein_id}'}, interaction_objects))
edges.extend(auto_edges)
result['network']['edges'].extend(edges)
result['network']['nodes'] = list(identifier_nodes)
# uniq_edges = dict()
# for edge in result['network']['edges']:
# hash = edge['from'] + edge['to']
# uniq_edges[hash] = edge
# result['network']['edges']=list(uniq_edges.values())
# result['network']['nodes'] = list(identifier_nodes)
if 'scores' in result['node_attributes']:
del result['node_attributes']['scores']
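The commented-out deduplication above keys an edge by concatenating 'from' and 'to', which treats the two orientations of an undirected edge as distinct. A hedged sketch of an order-insensitive variant, assuming the result edges are undirected:

uniq_edges = {}
for edge in result['network']['edges']:
    key = tuple(sorted((edge['from'], edge['to'])))  # same key for both orientations
    uniq_edges[key] = edge
result['network']['edges'] = list(uniq_edges.values())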
......
@@ -172,15 +172,13 @@ def betweenness_centrality(task_hook: TaskHook):
id_space = task_hook.parameters["config"].get("identifier", "symbol")
print(id_space)
# Parsing input file.
task_hook.set_progress(0 / 3.0, "Parsing input.")
filename = f"internal_{ppi_dataset['name']}_{pdi_dataset['name']}"
filename = f"{id_space}_{ppi_dataset['name']}-{pdi_dataset['name']}"
if ppi_dataset['licenced'] or pdi_dataset['licenced']:
filename += "_licenced"
filename = os.path.join(task_hook.data_directory, filename+".gt")
g, seed_ids, id_space, drug_ids = read_graph_tool_graph(
filename = os.path.join(task_hook.data_directory, filename + ".gt")
g, seed_ids, drug_ids = read_graph_tool_graph(
filename,
seeds,
max_deg,
......
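The same filename scheme recurs in every task below; a tiny runnable sketch of the resolved names, assuming hypothetical dataset names 'STRING' and 'DrugBank':

ppi_dataset = {'name': 'STRING', 'licenced': False}   # hypothetical
pdi_dataset = {'name': 'DrugBank', 'licenced': True}  # hypothetical
id_space = 'symbol'

filename = f"{id_space}_{ppi_dataset['name']}-{pdi_dataset['name']}"
if ppi_dataset['licenced'] or pdi_dataset['licenced']:
    filename += "_licenced"
print(filename + ".gt")  # symbol_STRING-DrugBank_licenced.gt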
@@ -170,13 +170,13 @@ def closeness_centrality(task_hook: TaskHook):
# Parsing input file.
task_hook.set_progress(0 / 4.0, "Parsing input.")
filename = f"internal_{ppi_dataset['name']}_{pdi_dataset['name']}"
if ppi_dataset['licenced'] or pdi_dataset['licenced']:
filename += "_licenced"
id_space = task_hook.parameters["config"].get("identifier", "symbol")
filename = os.path.join(task_hook.data_directory, filename+".gt")
filename = f"{id_space}_{ppi_dataset['name']}-{pdi_dataset['name']}"
if ppi_dataset['licenced'] or pdi_dataset['licenced']:
filename += "_licenced"
filename = os.path.join(task_hook.data_directory, filename + ".gt")
# g, seed_ids, viral_protein_ids, drug_ids = read_graph_tool_graph(file_path, seeds, datasets, ignored_edge_types, max_deg, ignore_non_seed_baits, include_indirect_drugs, include_non_approved_drugs)
g, seed_ids, drug_ids = read_graph_tool_graph(filename, seeds, id_space, max_deg, include_indirect_drugs, include_non_approved_drugs, search_target)
task_hook.set_progress(1 / 4.0, "Computing edge weights.")
......
@@ -150,13 +150,13 @@ def degree_centrality(task_hook: TaskHook):
# Parsing input file.
task_hook.set_progress(0 / 3.0, "Parsing input.")
filename = f"internal_{ppi_dataset['name']}_{pdi_dataset['name']}"
if ppi_dataset['licenced'] or pdi_dataset['licenced']:
filename += "_licenced"
id_space = task_hook.parameters["config"].get("identifier", "symbol")
filename = os.path.join(task_hook.data_directory, filename+".gt")
filename = f"{id_space}_{ppi_dataset['name']}-{pdi_dataset['name']}"
if ppi_dataset['licenced'] or pdi_dataset['licenced']:
filename += "_licenced"
filename = os.path.join(task_hook.data_directory, filename + ".gt")
# g, seed_ids, viral_protein_ids, drug_ids = read_graph_tool_graph(file_path, seeds, datasets, ignored_edge_types, max_deg, ignore_non_seed_baits, False, include_non_approved_drugs)
g, seed_ids, drug_ids = read_graph_tool_graph(filename, seeds, id_space, max_deg, False, include_non_approved_drugs, search_target)
......
@@ -10,7 +10,7 @@ import requests
from tasks.task_hook import TaskHook
from drugstone.models import Protein
from drugstone.models import Protein, EnsemblGene
# Base URL
# url = 'http://172.25.0.1:9003/keypathwayminer/requests/'
@@ -57,9 +57,18 @@ def kpm_task(task_hook: TaskHook):
# --- Fetch and generate the datasets
dataset_name = 'indicatorMatrix'
indicator_matrix_string = ''
protein_backend_ids = [int(seed[1:]) for seed in task_hook.seeds]
proteins = Protein.objects.filter(id__in=protein_backend_ids)
id_space = task_hook.parameters["config"].get("identifier", "symbol")
proteins = []
if id_space == 'symbol':
proteins = Protein.objects.filter(gene__in=task_hook.seeds)
elif id_space == 'entrez':
proteins = Protein.objects.filter(entrez__in=task_hook.seeds)
elif id_space == 'uniprot':
proteins = Protein.objects.filter(uniprot_code__in=task_hook.seeds)
elif id_space == 'ensg':
protein_ids = {ensg.protein_id for ensg in EnsemblGene.objects.filter(name__in=task_hook.seeds)}
proteins = Protein.objects.filter(id__in=protein_ids)
protein_backend_ids = {p.id for p in proteins}
for protein in proteins:
indicator_matrix_string += f'{protein.uniprot_code}\t1\n'
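The loop serializes the seeds into the indicator-matrix format expected by KeyPathwayMiner: one tab-separated row per protein with a constant indicator of 1. A sketch with hypothetical UniProt accessions:

indicator_matrix_string = ''
for uniprot_code in ['P04637', 'P38398']:  # hypothetical accessions
    indicator_matrix_string += f'{uniprot_code}\t1\n'
# result: "P04637\t1\nP38398\t1\n"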
......
@@ -97,7 +97,7 @@ def multi_steiner(task_hook: TaskHook):
search_target = task_hook.parameters.get("target", "drug-target")
node_name_attribute = "drugstone_id" # nodes in the input network which is created from RepoTrialDB have primaryDomainId as name attribute
node_name_attribute = "internal_id" # nodes in the input network which is created from RepoTrialDB have primaryDomainId as name attribute
# Set number of threads if OpenMP support is enabled.
if gt.openmp_enabled():
@@ -108,11 +108,10 @@ def multi_steiner(task_hook: TaskHook):
id_space = task_hook.parameters["config"].get("identifier", "symbol")
filename = f"internal_{ppi_dataset['name']}_{pdi_dataset['name']}"
filename = f"{id_space}_{ppi_dataset['name']}-{pdi_dataset['name']}"
if ppi_dataset['licenced'] or pdi_dataset['licenced']:
filename += "_licenced"
filename = os.path.join(task_hook.data_directory, filename+".gt")
filename = os.path.join(task_hook.data_directory, filename + ".gt")
g, seed_ids, _ = read_graph_tool_graph(filename, seeds, id_space, max_deg, target=search_target)
# seed_map = {g.vertex_properties["name"][node]: node for node in seed_ids}
seed_map = {g.vertex_properties[node_name_attribute][node]: node for node in seed_ids}
......
@@ -89,10 +89,10 @@ def network_proximity(task_hook: TaskHook):
id_space = task_hook.parameters["config"].get("identifier", "symbol")
filename = f"internal_{ppi_dataset['name']}_{pdi_dataset['name']}"
filename = f"{id_space}_{ppi_dataset['name']}-{pdi_dataset['name']}"
if ppi_dataset['licenced'] or pdi_dataset['licenced']:
filename += "_licenced"
filename = os.path.join(task_hook.data_directory, filename+".gt")
filename = os.path.join(task_hook.data_directory, filename + ".gt")
# g, seed_ids, _, drug_ids = read_graph_tool_graph(file_path, seeds, "", "", max_deg, False, True, include_non_approved_drugs)
g, seed_ids, drug_ids = read_graph_tool_graph(filename, seeds, id_space, max_deg, True, include_non_approved_drugs, target=search_target)
# Computing edge weights.
......
@@ -201,7 +201,7 @@ def trust_rank(task_hook: TaskHook):
id_space = task_hook.parameters["config"].get("identifier", "symbol")
filename = f"internal_{ppi_dataset['name']}_{pdi_dataset['name']}"
filename = f"{id_space}_{ppi_dataset['name']}-{pdi_dataset['name']}"
if ppi_dataset['licenced'] or pdi_dataset['licenced']:
filename += "_licenced"
filename = os.path.join(task_hook.data_directory, filename+".gt")
......
@@ -17,7 +17,7 @@ def __dfs_find_bridges(g, node, visited, disc, low, parent, is_bridge):
low[node] = min(low[node], low[nb])
if low[nb] > disc[node]:
is_bridge[g.edge(node, nb)] = True
elif int(nb) != parent[node]: #TODO can in theory be removed because
elif int(nb) != parent[node]: #TODO can in theory be removed
low[node] = min(low[node], disc[nb])
def find_bridges(g):
......
@@ -3,7 +3,8 @@ import graph_tool.topology as gtt
# def read_graph_tool_graph(file_path, seeds, datasets, ignored_edge_types, max_deg, ignore_non_seed_baits=False, include_indirect_drugs=False, include_non_approved_drugs=False):
def read_graph_tool_graph(file_path, seeds,id_space, max_deg, include_indirect_drugs=False, include_non_approved_drugs=False,
def read_graph_tool_graph(file_path, seeds, id_space, max_deg, include_indirect_drugs=False,
include_non_approved_drugs=False,
target='drug'):
r"""Reads a graph-tool graph from file.
@@ -45,7 +46,7 @@ def read_graph_tool_graph(file_path, seeds,id_space, max_deg, include_indirect_d
# drug_protein = "DrugHasTarget"
d_type = "drug"
node_name_attribute = "drugstone_id" # nodes in the input network which is created from RepoTrialDB have primaryDomainId as name attribute
node_name_attribute = "internal_id" # nodes in the input network which is created from RepoTrialDB have primaryDomainId as name attribute
# Delete all nodes that are not contained in the selected datasets and have degrees higher than max_deg
deleted_nodes = []
for node in range(g.num_vertices()):
@@ -59,20 +60,18 @@ def read_graph_tool_graph(file_path, seeds,id_space, max_deg, include_indirect_d
# remove all drugs from graph if we are not looking for drugs
elif target != 'drug' and g.vertex_properties["type"][node] == d_type:
deleted_nodes.append(node)
g.remove_vertex(deleted_nodes, fast=True)
g.remove_vertex(reversed(sorted(deleted_nodes)), fast=True)
# Retrieve internal IDs of seed_ids
seeds = set(seeds)
print(seeds)
seed_ids = {}
drug_ids = []
# is_matched = {protein: False for protein in seeds}
for node in range(g.num_vertices()):
node_type = g.vertex_properties["type"][node]
seed_id = g.vertex_properties[node_name_attribute][node]
if seed_id in seeds:
seed_ids[node] = seed_id
# is_matched[seed_id] = node
if node_type == d_type:
if include_non_approved_drugs:
drug_ids.append(node)
@@ -81,16 +80,6 @@ def read_graph_tool_graph(file_path, seeds,id_space, max_deg, include_indirect_d
if "approved" in drug_groups:
drug_ids.append(node)
# Check that all seed seeds have been matched and throw error, otherwise.
# print(deleted_nodes)
# print(seed_ids)
# seeds = set(seed_ids.values())
# for (node, seed_id) in seed_ids.items():
# if is_matched[node]
# for protein, found in is_matched.items():
# if not found:
# raise ValueError("Invalid seed protein {}. No node named {} in {}.".format(protein, protein, file_path))
# Delete edges that should be ignored or are not contained in the selected dataset.
deleted_edges = []
@@ -138,17 +127,11 @@ def read_graph_tool_graph(file_path, seeds,id_space, max_deg, include_indirect_d
for edge in deleted_edges:
g.remove_edge(edge)
g.set_fast_edge_removal(fast=False)
print("Drugs")
print(drug_ids)
print("Vertices")
vertices = 0
for _ in g.vertices():
vertices += 1
print(f'\t{vertices}')
print("Edges")
edges = 0
for _ in g.edges():
edges += 1
print(f'\t{edges}')
# Return the graph and the indices of the seed_ids and the seeds.
return g, list(seed_ids.keys()), drug_ids
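For reference, a hedged usage sketch of the updated signature (the file path and seed values are hypothetical; the function is the one defined above):

g, seed_ids, drug_ids = read_graph_tool_graph(
    "./data/Networks/symbol_STRING-DrugBank.gt",  # hypothetical path
    seeds=["TP53", "BRCA1"],                      # seeds given in the chosen id space
    id_space="symbol",
    max_deg=500,
    include_indirect_drugs=False,
    include_non_approved_drugs=False,
    target="drug",
)
# seed_ids are the vertex indices whose "internal_id" matched a seed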
@@ -15,7 +15,7 @@ def scores_to_results(
r"""Transforms the scores to the required result format."""
node_name_attribute = "drugstone_id" # nodes in the input network which is created from RepoTrialDB have primaryDomainId as name attribute
node_name_attribute = "internal_id" # nodes in the input network which is created from RepoTrialDB have primaryDomainId as name attribute
candidates = []
# if strain_or_drugs == "drugs":
if target == "drug":
@@ -23,8 +23,6 @@ def scores_to_results(
else:
candidates = [(node, scores[node]) for node in range(g.num_vertices()) if scores[node] > 0 and node not in set(seed_ids)]
best_candidates = [item[0] for item in sorted(candidates, key=lambda item: item[1], reverse=True)[:result_size]]
print(f'Candidate list length: {len(best_candidates)}')
# Concatenate best result candidates with seeds and compute induced subgraph.
# since the result size filters out nodes, the result network is not complete anymore.
# Therefore, it is necessary to find the shortest paths to the found nodes in case intermediate nodes have been removed.
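The reconnection step below leans on graph-tool's topology helpers; a minimal sketch of the two calls involved (the candidate and seed indices are hypothetical):

import graph_tool.topology as gtt

# distances from one candidate vertex to all seed vertices
distances = gtt.shortest_distance(g, candidate, seed_ids)

# concrete path to one seed: the vertices and edges it passes through
vertices, edges = gtt.shortest_path(g, candidate, seed_ids[0])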
@@ -36,6 +34,7 @@ def scores_to_results(
returned_nodes = set(seed_ids) # return seed_ids in any case
# return only the path to a drug with the shortest distance
accepted_candidates = set()
if filterPaths:
for candidate in best_candidates:
distances = gtt.shortest_distance(g, candidate, seed_ids)
@@ -53,11 +52,12 @@ def scores_to_results(
break
if drug_in_path:
continue
accepted_candidates.add(g.vertex_properties[node_name_attribute][int(candidate)])
for vertex in vertices:
if int(vertex) not in returned_nodes:
# inserting intermediate node in order to make result comprehensive
intermediate_nodes.add(g.vertex_properties[node_name_attribute][int(vertex)])
if vertex != candidate:
intermediate_nodes.add(g.vertex_properties[node_name_attribute][int(vertex)])
returned_nodes.add(int(vertex))
for edge in edges:
if ((edge.source(), edge.target()) not in returned_edges) and ((edge.target(), edge.source()) not in returned_edges):
@@ -74,18 +74,21 @@ def scores_to_results(
break
if drug_in_path:
continue
accepted_candidates.add(g.vertex_properties[node_name_attribute][int(candidate)])
for vertex in vertices:
if int(vertex) not in returned_nodes:
# inserting intermediate node in order to make result comprehensive
intermediate_nodes.add(g.vertex_properties[node_name_attribute][int(vertex)])
if vertex != candidate:
intermediate_nodes.add(g.vertex_properties[node_name_attribute][int(vertex)])
returned_nodes.add(int(vertex))
for edge in edges:
if ((edge.source(), edge.target()) not in returned_edges) and ((edge.target(), edge.source()) not in returned_edges):
returned_edges.add((edge.source(), edge.target()))
print(f'Returned nodes number: {len(returned_nodes)}')
for node in accepted_candidates:
if node in intermediate_nodes:
intermediate_nodes.remove(node)
subgraph = {
"nodes": [g.vertex_properties[node_name_attribute][node] for node in returned_nodes],
"nodes":[g.vertex_properties[node_name_attribute][node] for node in returned_nodes],
"edges": [{"from": g.vertex_properties[node_name_attribute][source], "to": g.vertex_properties[node_name_attribute][target]} for source, target in returned_edges],
}
@@ -97,6 +100,7 @@ def scores_to_results(
return {
"network": subgraph,
'intermediate_nodes': list(intermediate_nodes),
'target_nodes': list(accepted_candidates),
"node_attributes":
{
"node_types": node_types,
......
@@ -5,7 +5,7 @@ import itertools as it
def steiner_tree(g, seeds, seed_map, weights, non_zero_hub_penalty):
node_name_attribute = "drugstone_id" # nodes in the input network which is created from RepoTrialDB have primaryDomainId as name attribute
node_name_attribute = "internal_id" # nodes in the input network which is created from RepoTrialDB have primaryDomainId as name attribute
mc = gt.Graph(directed=False)
eprop_dist = mc.new_edge_property("int")
mc.ep['dist'] = eprop_dist
......
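Context for this last hunk: mc, with its integer 'dist' edge property, is presumably the auxiliary graph for the classic metric-closure 2-approximation of a Steiner tree: connect every pair of seeds by an edge weighted with their shortest-path distance, compute a minimum spanning tree on that closure, and expand its edges back into concrete paths. A hedged sketch of the auxiliary-graph setup only:

import graph_tool.all as gt

mc = gt.Graph(directed=False)
eprop_dist = mc.new_edge_property("int")
mc.ep['dist'] = eprop_dist

u, v = mc.add_vertex(), mc.add_vertex()  # stand-ins for two seed vertices
e = mc.add_edge(u, v)
eprop_dist[e] = 3  # hypothetical shortest-path distance between the seeds in g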