Skip to content
Snippets Groups Projects
Commit 56798bba authored by Le, Mia's avatar Le, Mia
Browse files

edited seedvar for tool comparisons

parent bda13dbd
Branches
No related tags found
No related merge requests found
......@@ -9,6 +9,23 @@ from configparser import ConfigParser
import preprocess
from consensus import cami_v1, cami_v2, cami_v3
import matplotlib.pyplot as plt
import itertools
def generate_param_combinations(params_dict):
    """Expand a parameter grid into one setting per function and value combination.

    Args:
        params_dict: Mapping with a mandatory ``'function'`` entry, itself a
            dict ``{function_name: callable}``, plus any number of further
            entries ``{parameter_name: iterable_of_values}``. The position of
            the ``'function'`` key inside the mapping does not matter.

    Returns:
        A list with one ``[function_name, params_str, setting]`` entry for
        every (function, parameter combination) pair. ``params_str`` is the
        ``'<name>_<value>'`` pairs joined by ``'_'`` with spaces removed (an
        empty string when there are no parameters), and ``setting`` is
        ``{'params': {name: value, ...}, 'function': callable}``.

    Raises:
        KeyError: If ``params_dict`` has no ``'function'`` entry.
    """
    function_dict = params_dict['function']
    # Select parameter names by key instead of slicing off the last key, so
    # the pairing with the values stays correct for any key order.
    param_names = [key for key in params_dict if key != 'function']
    # itertools.product returns a one-shot iterator; materialise it so that
    # every function receives the full grid, not only the first one.
    combinations = list(
        itertools.product(*(params_dict[name] for name in param_names))
    )

    result = []
    for function_name, function in function_dict.items():
        for combination in combinations:
            # A fresh dict per setting, so settings never share mutable state.
            param_dict = dict(zip(param_names, combination))
            params_str = '_'.join(
                f"{k}_{v}".replace(' ', '') for k, v in param_dict.items()
            )
            result.append(
                [function_name, params_str,
                 {'params': param_dict, 'function': function}]
            )
    return result
def initialize_cami(path_to_ppi_file=''):
cami_params = {}
......@@ -298,7 +315,7 @@ class cami():
result_sets[tool] = set(result_list)
return result_sets
def create_consensus(self, result_sets):
def create_consensus(self, result_sets, save_output=True):
"""takes a set of active module predictions and creates a consensus
that combines all the results of the different tools.
......@@ -321,51 +338,83 @@ class cami():
# remove seeds from result sets
for tool in result_sets:
result_sets[tool] -= set(self.seed_lst)
params = {'hub_pentalty': [0, 0.25, 0.5, 0.75, 1.0], 'damping_factor': [0.1, 0.25, 0.5, 0.75], 'confidence_level': [0.2, 0.35, 0.5, 0.75], 'ranking':["trustrank", "betweenness", "harmonic"], 'function':[cami_v2.run_cami, cami_v3.run_cami]}
camis = {
'union': {'function': cami_v1.make_union, 'params': {}},
'intersection': {'function': cami_v1.make_intersection, 'params': {}},
'first_neighbors': {'function': cami_v1.make_first_neighbor_result_set, 'params': {}},
'cami_v1': {'function': cami_v1.run_cami, 'params': {'consens_threshold': consens_threshold}},
'cami_v2_param1_tr': {'function': cami_v2.run_cami, 'params': {
'hub_penalty': 0.3, 'damping_factor': 0.7, 'confidence_level': 0.5
}},
'cami_v2_param1_b': {'function': cami_v2.run_cami, 'params': {
'hub_penalty': 0.3, 'damping_factor': 0.7, 'ranking': 'betweenness', 'confidence_level': 0.5
}},
'cami_v2_param1_hc': {'function': cami_v2.run_cami, 'params': {
'hub_penalty': 0.3, 'damping_factor': 0.7, 'ranking': 'harmonic', 'confidence_level': 0.5
}},
'cami_v2_param2_tr': {'function': cami_v2.run_cami, 'params': {
'hub_penalty': 0, 'damping_factor': 0.7, 'confidence_level': 0.5
}},
'cami_v2_param2_b': {'function': cami_v2.run_cami, 'params': {
'hub_penalty': 0, 'damping_factor': 0.7, 'ranking': 'betweenness', 'confidence_level': 0.5
}},
'cami_v2_param2_hc': {'function': cami_v2.run_cami, 'params': {
'hub_penalty': 0, 'damping_factor': 0.7, 'ranking': 'harmonic', 'confidence_level': 0.5
}},
'cami_v3_param1_tr': {'function': cami_v3.run_cami, 'params': {
'hub_penalty': 0.3, 'damping_factor': 0.7, 'confidence_level': 0.5
}},
'cami_v3_param1_b': {'function': cami_v3.run_cami, 'params': {
'hub_penalty': 0.3, 'damping_factor': 0.7, 'ranking': 'betweenness', 'confidence_level': 0.5
}},
'cami_v3_param1_hc': {'function': cami_v3.run_cami, 'params': {
'hub_penalty': 0.3, 'damping_factor': 0.7, 'ranking': 'harmonic', 'confidence_level': 0.5
}},
'cami_v3_param2_tr': {'function': cami_v3.run_cami, 'params': {
'hub_penalty': 0, 'damping_factor': 0.7, 'confidence_level': 0.5
}},
'cami_v3_param2_b': {'function': cami_v3.run_cami, 'params': {
'hub_penalty': 0, 'damping_factor': 0.7, 'ranking': 'betweenness', 'confidence_level': 0.5
}},
'cami_v3_param2_hc': {'function': cami_v3.run_cami, 'params': {
'hub_penalty': 0, 'damping_factor': 0.7, 'ranking': 'harmonic', 'confidence_level': 0.5
}},
}
params_1 = {'consens_threshold': [consens_threshold],
'function': {'cami_v1': cami_v1.run_cami}}
params_0 = {'function': {'union':cami_v1.make_union,
'intersection':cami_v1.make_intersection,
'first_neighbours': cami_v1.make_first_neighbor_result_set}
}
params_tr = {'hub_penalty': [0, 0.25, 0.5, 0.75, 1.0],
'damping_factor': [0.1, 0.25, 0.5, 0.75],
'confidence_level': [0.2, 0.35, 0.5, 0.75],
'ranking': ['trustrank'],
'function': {'cami_v2': cami_v2.run_cami,
'cami_v3':cami_v3.run_cami}}
params_b_m = {'hub_penalty': [0, 0.25, 0.5, 0.75, 1.0],
'confidence_level': [0.2, 0.35, 0.5, 0.75],
'ranking': ['betweenness', 'harmonic'],
'function': {'cami_v2': cami_v2.run_cami,
'cami_v3':cami_v3.run_cami}}
cami_setting_list = generate_param_combinations(params_0)+\
generate_param_combinations(params_1)+\
generate_param_combinations(params_tr)+\
generate_param_combinations(params_b_m)
camis = {}
for setting in cami_setting_list:
if setting[1]:
func_name = setting[0] + '_' +setting[1]
else:
func_name = setting[0]
camis[func_name] = setting[2]
# camis = {
# 'union': {'function': cami_v1.make_union, 'params': {}},
# 'intersection': {'function': cami_v1.make_intersection, 'params': {}},
# 'first_neighbors': {'function': cami_v1.make_first_neighbor_result_set, 'params': {}},
# 'cami_v1': {'function': cami_v1.run_cami, 'params': {'consens_threshold': consens_threshold}},
# 'cami_v2_param1_tr': {'function': cami_v2.run_cami, 'params': {
# 'hub_penalty': 0.3, 'damping_factor': 0.7, 'confidence_level': 0.5
# }},
# 'cami_v2_param1_b': {'function': cami_v2.run_cami, 'params': {
# 'hub_penalty': 0.3, 'damping_factor': 0.7, 'ranking': 'betweenness', 'confidence_level': 0.5
# }},
# 'cami_v2_param1_hc': {'function': cami_v2.run_cami, 'params': {
# 'hub_penalty': 0.3, 'damping_factor': 0.7, 'ranking': 'harmonic', 'confidence_level': 0.5
# }},
# 'cami_v2_param2_tr': {'function': cami_v2.run_cami, 'params': {
# 'hub_penalty': 0, 'damping_factor': 0.7, 'confidence_level': 0.5
# }},
# 'cami_v2_param2_b': {'function': cami_v2.run_cami, 'params': {
# 'hub_penalty': 0, 'damping_factor': 0.7, 'ranking': 'betweenness', 'confidence_level': 0.5
# }},
# 'cami_v2_param2_hc': {'function': cami_v2.run_cami, 'params': {
# 'hub_penalty': 0, 'damping_factor': 0.7, 'ranking': 'harmonic', 'confidence_level': 0.5
# }},
# 'cami_v3_param1_tr': {'function': cami_v3.run_cami, 'params': {
# 'hub_penalty': 0.3, 'damping_factor': 0.7, 'confidence_level': 0.5
# }},
# 'cami_v3_param1_b': {'function': cami_v3.run_cami, 'params': {
# 'hub_penalty': 0.3, 'damping_factor': 0.7, 'ranking': 'betweenness', 'confidence_level': 0.5
# }},
# 'cami_v3_param1_hc': {'function': cami_v3.run_cami, 'params': {
# 'hub_penalty': 0.3, 'damping_factor': 0.7, 'ranking': 'harmonic', 'confidence_level': 0.5
# }},
# 'cami_v3_param2_tr': {'function': cami_v3.run_cami, 'params': {
# 'hub_penalty': 0, 'damping_factor': 0.7, 'confidence_level': 0.5
# }},
# 'cami_v3_param2_b': {'function': cami_v3.run_cami, 'params': {
# 'hub_penalty': 0, 'damping_factor': 0.7, 'ranking': 'betweenness', 'confidence_level': 0.5
# }},
# 'cami_v3_param2_hc': {'function': cami_v3.run_cami, 'params': {
# 'hub_penalty': 0, 'damping_factor': 0.7, 'ranking': 'harmonic', 'confidence_level': 0.5
# }},
# }
# transform all vertex indices to their corresponding gene names in a result set
for tool in result_sets:
......@@ -405,8 +454,9 @@ class cami():
sys.setrecursionlimit(recursion_limit)
# save the results in outputfiles
self.generate_output(cami_method_name, seed_genes, cami_vlist, cami_vertices, putative_vertices, cami_genes,
gene_name_map, codes2tools, cami_scores)
if save_output:
self.generate_output(cami_method_name, seed_genes, cami_vlist, cami_vertices, putative_vertices, cami_genes,
gene_name_map, codes2tools, cami_scores)
# add seeds to result sets for drugstone and digest
for toolname in self.result_gene_sets:
......
......@@ -8,7 +8,6 @@ from utils.networks import trustrank, betweenness, must, closeness
# This uses a trustrank algorithm to rank all putative nodes starting from the seeds and only accepts the top 0.X entries
# TODO maybe find a smart way to cutoff automatically?
def run_cami(result_sets, ppi_graph, seed_lst, predicted_by, cami_scores, code2toolname, tool_code, params):
damping_factor = params['damping_factor']
hub_penalty = params['hub_penalty']
confidence_level = params.get('confidence_level',0.5)
weighted = 'weighted' in params and params['weighted']
......@@ -36,6 +35,7 @@ def run_cami(result_sets, ppi_graph, seed_lst, predicted_by, cami_scores, code2t
for v, c in counts.items():
weights.a[int(v)] = c
if ranking_method == 'trustrank':
damping_factor = params['damping_factor']
scores = trustrank(subnet, seed_lst, damping_factor, hub_penalty, weights)
elif ranking_method == 'betweenness':
scores = betweenness(subnet, hub_penalty, weights)
......
......@@ -7,7 +7,6 @@ import graph_tool as gt
# This uses a trustrank algorithm to rank all putative nodes starting from the seeds and only accepts the top 0.X entries
# TODO maybe find a smart way to cutoff automatically?
def run_cami(result_sets, ppi_graph, seed_lst, predicted_by, cami_scores, code2toolname, tool_code, params):
damping_factor = params['damping_factor']
hub_penalty = params['hub_penalty']
confidence_level = params.get('confidence_level', 0.5)
weighted = 'weighted' in params and params['weighted']
......@@ -39,6 +38,7 @@ def run_cami(result_sets, ppi_graph, seed_lst, predicted_by, cami_scores, code2t
weights.a[int(v)] = c
if ranking_method == 'trustrank':
damping_factor = params['damping_factor']
scores = trustrank(subnet, seed_lst, damping_factor, hub_penalty, weights)
elif ranking_method == 'betweenness':
scores = betweenness(subnet, hub_penalty, weights)
......
import matplotlib.pyplot as plt
import seaborn as sb
import pandas as pd
import os
import random
from cami_suite import cami
import utils.comparison_matrix as comparison_matrix
import numpy as np
def predict_and_make_consensus(cami, vis=False):
    """Run the tool predictions on ``cami`` and build the consensus from them.

    The consensus is created with ``save_output=False``.

    Args:
        cami: Object providing ``make_predictions()``,
            ``create_consensus(result_sets, save_output=...)`` and, when
            ``vis`` is true, ``visualize_and_save_comparison_matrix()`` and
            ``use_nvenn(download=...)``.
        vis: When true, additionally produce the comparison matrix and call
            ``use_nvenn(download=True)``.
    """
    result_sets = cami.make_predictions()
    cami.create_consensus(result_sets, save_output=False)
    if vis:
        cami.visualize_and_save_comparison_matrix()
        cami.use_nvenn(download=True)
def _two_line_label(tool):
    """Return ``tool`` with its second underscore replaced by a line break."""
    second_underscore = tool.find('_', tool.find('_') + 1)
    if second_underscore == -1:
        return tool
    return tool[:second_underscore] + '\n' + tool[second_underscore + 1:]


def _plot_metric_violins(ax, samples, tools, tool_labels, wrapper_names,
                         palette, title, ylabel, wrap_ylabel=True):
    """Draw one violin per tool on ``ax`` and colour it by tool category.

    ``palette`` maps the categories ``'tool'``, ``'first_neighbours'``,
    ``'set_operation'`` and ``'consensus'`` to matplotlib colour names.
    """
    violins = ax.violinplot(samples, showmeans=True, showextrema=True)
    # Every entry after the first two of the returned dict is drawn in black.
    for part in list(violins.keys())[2:]:
        violins[part].set_color('k')
    for body, tool in zip(violins['bodies'], tools):
        if tool in wrapper_names:
            category = 'tool'
        # The consensus settings in this project register the method as
        # 'first_neighbours'; the older spelling is accepted as well.
        elif tool in ('first_neighbors', 'first_neighbours'):
            category = 'first_neighbours'
        elif tool in ('union', 'intersection'):
            category = 'set_operation'
        else:
            category = 'consensus'
        body.set_facecolor(palette[category])
    ax.set_title(title, wrap=True, fontsize=14)
    ax.set_xticks(list(range(1, len(tools) + 1)))
    ax.set_xticklabels(tool_labels)
    ax.tick_params(axis='x', labelsize=11)
    if wrap_ylabel:
        ax.set_ylabel(ylabel, wrap=True, fontsize=14)
    else:
        ax.set_ylabel(ylabel, fontsize=14)


def make_seedvariation(cami, n_iterations, removal_frac=0.2, vis=False, plot=False):
    """Repeatedly remove random seeds, rerun CAMI and record what is rediscovered.

    In each iteration ``cami`` is reset, ``nof_removals`` randomly chosen
    seeds are removed, the tools are re-initialised and the predictions and
    consensus are recomputed. For every entry of ``cami.result_gene_sets``
    the removed seeds found again are recorded together with three ratios:
    rediscovered / removed (written as ``rdr`` and again as ``sensitivity``)
    and rediscovered / (module size + used seeds) (written as ``precision``).

    Files written to ``cami.output_dir``:
        * ``00_seedvariation_rediscovered_seeds.tsv``: one row per iteration
          with the rediscovered seeds of each tool.
        * ``00_seedvariation_result_table.tsv``: mean and standard deviation
          of the three ratios per tool.
        * ``same_rediscs_<uid>_comparison_matrix[_normalized].png``.
        * ``00_<uid>_seed_variation_result.png`` (only when ``plot`` is true).

    Args:
        cami: The CAMI instance to vary; it is modified in place.
        n_iterations: Number of removal rounds (converted with ``int``).
        removal_frac: Fraction of the original seeds removed per round; at
            least one seed is always removed.
        vis: Accepted for interface compatibility; not used in this function.
        plot: When true, save violin plots of the three ratios.

    Returns:
        The ``cami`` object that was passed in.

    Raises:
        KeyError: If a consensus run yields a result-set name that was not in
            ``cami.result_gene_sets`` when this function was called, because
            the per-tool statistics are keyed by the initial names.
    """
    identifier = cami.uid
    base_seeds = cami.origin_seed_lst
    original_seeds = [cami.ppi_vertex2gene[seed] for seed in base_seeds]
    print(f'All given seeds:{original_seeds}')

    # Fixed RNG seed so the sequence of removed seeds is reproducible.
    random.seed(50)
    nof_iterations = int(n_iterations)
    used_tools = list(cami.result_gene_sets.keys())
    nof_seeds = len(base_seeds)
    nof_removals = max(int(nof_seeds * removal_frac), 1)

    redisc_seeds_file = f'{cami.output_dir}/00_seedvariation_rediscovered_seeds.tsv'
    result_table_file = f'{cami.output_dir}/00_seedvariation_result_table.tsv'

    # Square tool-by-tool counter of shared rediscovered seeds.
    redisc_intersection_matrix = pd.DataFrame(0, columns=used_tools,
                                              index=used_tools, dtype=int)

    # Result dictionaries of the form {tool: [value for each iteration]}.
    tp_rate_dict = {tool: [] for tool in used_tools}
    redisc_rate_dict = {tool: [] for tool in used_tools}
    module_size_dict = {tool: [] for tool in used_tools}

    with open(redisc_seeds_file, 'w') as redisc_table, \
            open(result_table_file, 'w') as res_table:
        redisc_table.write('id')
        for tool in used_tools:
            redisc_table.write(f'\t{tool}')
        redisc_table.write('\n')
        res_table.write('tool\trdr\trdr_std\tsensitivity\tsensitivity_std\tprecision\tprecision_std\n')

        for ident in range(nof_iterations):
            redisc_table.write(f'{ident}')
            # Reset cami under a per-iteration uid.
            cami.reset_cami(new_uid=identifier + f'_{ident}')

            # Remove a fresh random subset of the seeds.
            print(f'Removing {nof_removals} seeds from the original seed list...')
            removed_seeds_idx = random.sample(range(nof_seeds), nof_removals)
            removed_seeds = cami.remove_seeds(removed_seeds_idx)
            rem_seeds = [cami.ppi_vertex2gene[seed] for seed in removed_seeds]
            print(f'Removed: {rem_seeds} from the seed list')
            print('Updating tools and repeat CAMI')

            # Reinitialise the tools and repeat the consensus.
            cami.initialize_all_tools()
            predict_and_make_consensus(cami)

            used_seeds = [cami.ppi_vertex2gene[seed] for seed in cami.seed_lst]
            removed_gene_set = set(rem_seeds)
            redisc_seeds_dict = {}
            result_dict = cami.result_gene_sets
            for tool in result_dict:
                nof_predictions = len(result_dict[tool]) + len(used_seeds)
                redisc_seeds = set(result_dict[tool]).intersection(removed_gene_set)
                redisc_prev = len(redisc_seeds)
                redisc_seeds_dict[tool] = redisc_seeds

                redisc_rate_dict[tool].append(redisc_prev / nof_removals)
                tp_rate_dict[tool].append(redisc_prev / len(removed_seeds))
                module_size_frac = redisc_prev / nof_predictions
                assert module_size_frac <= 1
                module_size_dict[tool].append(module_size_frac)

                redisc_table.write('\t')
                redisc_table.write(','.join(f'{seed}' for seed in redisc_seeds))
                print(f'{tool} rediscovered {redisc_seeds} after removing {rem_seeds}.')
            redisc_table.write('\n')

            # NOTE(review): the source indentation was lost; this update is
            # placed inside the iteration loop because it accumulates with
            # '+=' and the plot title speaks of a "number of times". Confirm.
            for algo1 in redisc_seeds_dict:
                for algo2 in redisc_seeds_dict:
                    redisc_intersection_matrix.loc[algo1, algo2] += len(
                        redisc_seeds_dict[algo1].intersection(redisc_seeds_dict[algo2]))

        for tool in redisc_rate_dict:
            statistics = (
                np.mean(redisc_rate_dict[tool]), np.std(redisc_rate_dict[tool]),
                np.mean(tp_rate_dict[tool]), np.std(tp_rate_dict[tool]),
                np.mean(module_size_dict[tool]), np.std(module_size_dict[tool]),
            )
            res_table.write(f'{tool}\t' + '\t'.join(f'{value}' for value in statistics) + '\n')

    print('Result tables are saved in the following locations:')
    print(f'Rediscovered seeds: {redisc_seeds_file}')
    print(f'Result table: {result_table_file}')

    matrix_fig, _, normalized_fig, _ = comparison_matrix.plot_comparison_matrix(
        redisc_intersection_matrix, n_rows=cami.nof_tools,
        title=f'number of times algorithms rediscovered the same seeds after removing {nof_removals} seeds')
    matrix_fig.savefig(f'{cami.output_dir}/same_rediscs_{identifier}_comparison_matrix.png')
    normalized_fig.savefig(f'{cami.output_dir}/same_rediscs_{identifier}_comparison_matrix_normalized.png')

    if plot:
        tools = list(redisc_rate_dict)
        # Break long tool names at their second underscore for the x axis.
        tool_labels = [_two_line_label(tool) for tool in tools]
        # Built once instead of once per violin.
        wrapper_names = {tw.name for tw in cami.tool_wrappers}

        fig, (ax_rdr, ax_precision, ax_sensitivity) = plt.subplots(3, 1, figsize=(20, 20))
        fig.subplots_adjust(left=0.2)

        _plot_metric_violins(
            ax_rdr, [redisc_rate_dict[tool] for tool in tools], tools, tool_labels, wrapper_names,
            {'tool': 'saddlebrown', 'first_neighbours': 'orange',
             'set_operation': 'peachpuff', 'consensus': 'red'},
            f'Rediscovery rate after randomly removing {nof_removals} seeds {nof_iterations} times from {identifier} seeds.',
            'Rediscovery rate (<rediscovered seeds>/<removed seeds>)')

        _plot_metric_violins(
            ax_sensitivity, [tp_rate_dict[tool] for tool in tools], tools, tool_labels, wrapper_names,
            {'tool': 'tan', 'first_neighbours': 'peachpuff',
             'set_operation': 'orange', 'consensus': 'darkorange'},
            f'True positive rates after randomly removing {nof_removals} seeds {nof_iterations} times from {identifier} seeds.',
            'Sensitivity (TP/TP + FN)')

        _plot_metric_violins(
            ax_precision, [module_size_dict[tool] for tool in tools], tools, tool_labels, wrapper_names,
            {'tool': 'midnightblue', 'first_neighbours': 'mediumblue',
             'set_operation': 'lightsteelblue', 'consensus': 'royalblue'},
            f'Ratio of number of rediscovered seeds and predicted module size after removing {nof_removals} seeds {nof_iterations} times from {identifier} seeds.',
            'precision (<rediscovered seeds>/<module size>)',
            wrap_ylabel=False)

        fig.tight_layout()
        fig.savefig(f'{cami.output_dir}/00_{identifier}_seed_variation_result.png', bbox_inches="tight")
        plt.close(fig)
        print(f'Violin plot saved under: 00_{identifier}_seed_variation_result.png')
    return cami
\ No newline at end of file
......@@ -144,7 +144,7 @@ def csv2graph(inputfile,
if unseen_vertices == 0:
break
g.vertex_properties["cami_score"] = g.new_vertex_property("float", val=0.0)
values = (50) * [-1]
values = (255) * [-1]
g.vertex_properties["predicted_by"] = g.new_vertex_property("vector<int16_t>", val=values)
return g
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment