Newer
Older
from tasks.util.custom_edges import add_edges
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
from tasks.util.read_graph_tool_graph import read_graph_tool_graph
from tasks.util.edge_weights import edge_weights
import os.path
import graph_tool as gt
import graph_tool.topology as gtt
import sys
import numpy as np
def network_proximity(task_hook: TaskHook):
# Type: List of str
# Semantics: Names of the seed proteins. Use UNIPROT AC for host proteins, and
# names of the format SARS_CoV2_<IDENTIFIER> (tree_edge.g., SARS_CoV2_ORF6) for
# virus proteins.
# Reasonable default: None, has to be selected by user via "select for analysis"
# utility in frontend.
# Acceptable values: UNIPROT ACs, identifiers of viral proteins.
seeds = task_hook.parameters["seeds"]
nodes_not_in_lcc = {'D3W0D1', 'O15178', 'O60542', 'O60609', 'O60882', 'Q5T4W7', 'Q6UVW9', 'Q6UW32', 'Q6UWQ7',
'Q6UXB1', 'Q7RTX0', 'Q7RTX1', 'Q8TE23', 'Q9GZZ7', 'Q9H2W2', 'Q9H665', 'Q9NZW4'}
# seeds = [node for node in set(seeds).difference(nodes_not_in_lcc)]
# Type: bool
# Semantics: Sepcifies whether should be included in the analysis when ranking drugs.
# Example: False.
# Reasonable default: False.
# Has no effect unless trust_rank.py is used for ranking drugs.
include_non_approved_drugs = task_hook.parameters.get("include_non_approved_drugs", False)
# Type: int.
# Semantics: Number of random seed sets for computing Z-scores.
# Example: 32.
# Reasonable default: 32.
# Acceptable values: Positive integers.
num_random_seed_sets = task_hook.parameters.get("num_random_seed_sets", 32)
# Type: int.
# Semantics: Number of random drug target sets for computing Z-scores.
# Example: 32.
# Reasonable default: 32.
# Acceptable values: Positive integers.
num_random_drug_target_sets = task_hook.parameters.get("num_random_drug_target_sets", 32)
# Type: int.
# Semantics: Number of returned drugs.
# Example: 20.
# Reasonable default: 20.
# Acceptable values: integers n with n > 0.
result_size = task_hook.parameters.get("result_size", 20)
# Type: int.
# Semantics: All nodes with degree > max_deg * g.num_vertices() are ignored.
# Example: 39.
# Reasonable default: sys.maxsize.
# Acceptable values: Positive integers.
max_deg = task_hook.parameters.get("max_deg", sys.maxsize)
# Type: float.
# Semantics: Penalty parameter for hubs. Set edge weight to 1 + (hub_penalty / 2) (e.source.degree + e.target.degree)
# Example: 0.5.
# Reasonable default: 0.
# Acceptable values: Floats between 0 and 1.
hub_penalty = task_hook.parameters.get("hub_penalty", 0.0)
# Type: int.
# Semantics: Number of threads used for running the analysis.
# Example: 1.
# Reasonable default: 1.
# Note: We probably do not want to expose this parameter to the user.
num_threads = task_hook.parameters.get("num_threads", 1)
ppi_dataset = task_hook.parameters.get("ppi_dataset")
pdi_dataset = task_hook.parameters.get("pdi_dataset")
search_target = task_hook.parameters.get("target", "drug-target")
filter_paths = task_hook.parameters.get("filter_paths", True)
custom_edges = task_hook.parameters.get("custom_edges", False)
node_name_attribute = "internal_id" # nodes in the input network which is created from RepoTrialDB have primaryDomainId as name attribute
# Set number of threads if OpenMP support is enabled.
if gt.openmp_enabled():
gt.openmp_set_num_threads(num_threads)
# Parsing input file.
task_hook.set_progress(0.0 / 8, "Parsing input.")
id_space = task_hook.parameters["config"].get("identifier", "symbol")
filename = f"{id_space}_{ppi_dataset['name']}-{pdi_dataset['name']}"
if ppi_dataset['licenced'] or pdi_dataset['licenced']:
filename += "_licenced"
filename = os.path.join(task_hook.data_directory, filename + ".gt")
# g, seed_ids, _, drug_ids = read_graph_tool_graph(file_path, seeds, "", "", max_deg, False, True, include_non_approved_drugs)
g, seed_ids, drug_ids = read_graph_tool_graph(filename, seeds, id_space, max_deg, True, include_non_approved_drugs, target=search_target)
if custom_edges:
edges = task_hook.parameters.get("input_network")['edges']
g = add_edges(g, edges)
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
# Computing edge weights.
task_hook.set_progress(1.0 / 8, "Computing edge weights.")
weights = edge_weights(g, hub_penalty)
# Delete drug targets not in LCC.
task_hook.set_progress(2.0 / 8, "Deleting drug targets not in LCC.")
drug_targets = {drug_id: g.get_all_neighbors(drug_id) for drug_id in drug_ids}
for drug_id in drug_ids:
deleted_targets = []
targets = drug_targets[drug_id]
for i in range(len(targets)):
# if g.vertex_properties["name"][targets[i]] in nodes_not_in_lcc:
if g.vertex_properties[node_name_attribute][targets[i]] in nodes_not_in_lcc:
deleted_targets.append(i)
drug_targets[drug_id] = np.delete(targets, deleted_targets)
# Compute all shortest path distances.
task_hook.set_progress(3.0 / 8, "Computing all shortest path distances.")
distances = None
if hub_penalty == 0:
# distances = np.load(os.path.join(task_hook.data_directory, "host-host-distances.npy")) # need to create this file and also rename it to protein-protein-distances.npy
distances = gtt.shortest_distance(g)
else:
distances = gtt.shortest_distance(g, weights=weights)
# Compute network proximities.
task_hook.set_progress(4.0 / 8, "Computing network proximities.")
proximities = {drug_id : 0 for drug_id in drug_ids}
for drug_id in drug_ids:
distance = 0.0
for drug_target in drug_targets[drug_id]:
distance += min([distances[drug_target][seed_id] for seed_id in seed_ids])
if len(drug_targets[drug_id]) == 0:
proximities[drug_id] = np.inf
else:
proximities[drug_id] = distance / float(len(drug_targets[drug_id]))
# Compute background distribution.
task_hook.set_progress(5.0 / 8, "Computing background distribution")
min_num_targets = min([len(drug_targets[drug_id]) for drug_id in drug_ids])
max_num_targets = max([len(drug_targets[drug_id]) for drug_id in drug_ids])
node_ids_in_lcc = [node for node in range(g.num_vertices()) if
# not g.vertex_properties["name"][node] in nodes_not_in_lcc]
not g.vertex_properties[node_name_attribute][node] in nodes_not_in_lcc]
background_distribution = []
num_seeds = len(seed_ids)
for i in range(num_random_seed_sets):
np.random.shuffle(node_ids_in_lcc)
random_seed_ids = node_ids_in_lcc[:num_seeds]
for k in range(num_random_drug_target_sets):
np.random.shuffle(node_ids_in_lcc)
random_drug_targets = node_ids_in_lcc[:np.random.randint(min_num_targets, max_num_targets + 1)]
distance = 0.0
for drug_target in random_drug_targets:
distance += min([distances[drug_target][seed_id] for seed_id in random_seed_ids])
background_distribution.append(distance / float(len(random_drug_targets)))
background_mean = np.mean(background_distribution)
background_std = np.std(background_distribution)
# Apply Z-score transformation.
task_hook.set_progress(6.0 / 8, "Applying Z-score transformation.")
drugs_with_z_scores = [(drug_id, (proximities[drug_id] - background_mean) / background_std) for drug_id in drug_ids]
task_hook.set_progress(7.0 / 8, "Formatting results.")
drugs_with_z_scores = [(drug_id, (proximities[drug_id] - background_mean) / background_std) for drug_id in drug_ids]
task_hook.set_progress(7.0 / 8, "Formatting results.")
best_drugs = [item for item in sorted(drugs_with_z_scores, key=lambda item: item[1])[:result_size]]
best_drugs_ids = [item[0] for item in best_drugs]
seed_ids = list(set(seed_ids))
# Concatenate best result candidates with seeds and compute induced subgraph.
# since the result size filters out nodes, the result network is not complete anymore.
# Therefore, it is necessary to find the shortest paths to the found nodes in case intermediate nodes have been removed.
# This leads to more nodes in the result that given in the threshold, but the added intermediate nodes help to understand the output
# and are marked as intermediate nodes.
intermediate_nodes = set()
returned_edges = set()
returned_nodes = set(seed_ids) # return seed_ids in any case
# return only the path to a drug with the shortest distance
if filter_paths:
for candidate in best_drugs_ids:
distances = gtt.shortest_distance(g, candidate, seed_ids)
closest_distance_mean = sum(distances) / len(distances)
for index, seed_id in enumerate(seed_ids):
if distances[index] > closest_distance_mean:
continue
vertices, edges = gtt.shortest_path(g, candidate, seed_id)
drug_in_path = False
for vertex in vertices:
if g.vertex_properties["type"][int(vertex)] == "Drug" and vertex != candidate:
drug_in_path = True
break
if drug_in_path:
continue
for vertex in vertices:
if int(vertex) not in returned_nodes:
# inserting intermediate node in order to make result comprehensive
intermediate_nodes.add(g.vertex_properties[node_name_attribute][int(vertex)])
returned_nodes.add(int(vertex))
for edge in edges:
if ((edge.source(), edge.target()) not in returned_edges) or ((edge.target(), edge.source()) not in returned_edges):
returned_edges.add((edge.source(), edge.target()))
else:
for candidate in best_drugs_ids:
for index, seed_id in enumerate(seed_ids):
vertices, edges = gtt.shortest_path(g, candidate, seed_id)
drug_in_path = False
for vertex in vertices:
if g.vertex_properties["type"][int(vertex)] == "Drug" and vertex != candidate:
drug_in_path = True
break
if drug_in_path:
continue
for vertex in vertices:
if int(vertex) not in returned_nodes:
# inserting intermediate node in order to make result comprehensive
intermediate_nodes.add(g.vertex_properties[node_name_attribute][int(vertex)])
returned_nodes.add(int(vertex))
for edge in edges:
if ((edge.source(), edge.target()) not in returned_edges) or ((edge.target(), edge.source()) not in returned_edges):
returned_edges.add((edge.source(), edge.target()))
subgraph = {
"nodes": [g.vertex_properties[node_name_attribute][node] for node in returned_nodes],
"edges": [{"from": g.vertex_properties[node_name_attribute][source], "to": g.vertex_properties[node_name_attribute][target]} for source, target in returned_edges],
}
# Compute node attributes.
node_types = {g.vertex_properties[node_name_attribute][node]: g.vertex_properties["type"][node] for node in returned_nodes}
is_seed = {g.vertex_properties[node_name_attribute][node]: node in set(seed_ids) for node in returned_nodes}
returned_scores = {g.vertex_properties[node_name_attribute][node]: None for node in returned_nodes}
for node, score in best_drugs:
# returned_scores[g.vertex_properties["name"][node]] = score
returned_scores[g.vertex_properties[node_name_attribute][node]] = score
task_hook.set_results({
"network": subgraph,
'intermediate_nodes': list(intermediate_nodes),
"node_attributes":
{
"node_types": node_types,
"is_seed": is_seed,
"scores": returned_scores
},
'gene_interaction_dataset': ppi_dataset,
'drug_interaction_dataset': pdi_dataset,
})