import itertools


def generate_param_combinations(params_dict):
    """Expand a parameter grid into one setting per (function, combination).

    ``params_dict`` maps every parameter name to a list of candidate values
    and additionally holds the key ``'function'`` which maps a display name
    to the callable that should be run with those parameters.

    :param params_dict: ``{'<param>': [values, ...], ...,
                          'function': {'<name>': callable, ...}}``
    :return: a list of ``[function_name, params_str, setting]`` entries where
             ``params_str`` is ``'<param>_<value>'`` pairs joined by ``'_'``
             (spaces stripped, empty when there are no parameters) and
             ``setting`` is ``{'params': {...}, 'function': callable}``.
             Entries are grouped by function, in the order of the function
             dict, and every function receives every combination.
    :raises KeyError: if ``params_dict`` has no ``'function'`` entry.
    """
    function_dict = params_dict['function']
    # Select the parameter names explicitly instead of assuming that
    # 'function' is the last key of the dict.
    param_names = [key for key in params_dict if key != 'function']
    # itertools.product returns a single-pass iterator; materialise it so the
    # combinations can be reused for every function and not only the first.
    combinations = list(
        itertools.product(*(params_dict[name] for name in param_names))
    )

    result = []
    for function_name, function in function_dict.items():
        for combination in combinations:
            # A fresh dict per entry so settings never share mutable state.
            param_dict = dict(zip(param_names, combination))
            params_str = '_'.join(
                f"{k}_{v}".replace(' ', '') for k, v in param_dict.items()
            )
            result.append([function_name, params_str,
                           {'params': param_dict, 'function': function}])
    return result
@@ -321,51 +338,83 @@ class cami(): # remove seeds from result sets for tool in result_sets: result_sets[tool] -= set(self.seed_lst) - - params = {'hub_pentalty': [0, 0.25, 0.5, 0.75, 1.0], 'damping_factor': [0.1, 0.25, 0.5, 0.75], 'confidence_level': [0.2, 0.35, 0.5, 0.75], 'ranking':["trustrank", "betweenness", "harmonic"], 'function':[cami_v2.run_cami, cami_v3.run_cami]} - - camis = { - 'union': {'function': cami_v1.make_union, 'params': {}}, - 'intersection': {'function': cami_v1.make_intersection, 'params': {}}, - 'first_neighbors': {'function': cami_v1.make_first_neighbor_result_set, 'params': {}}, - 'cami_v1': {'function': cami_v1.run_cami, 'params': {'consens_threshold': consens_threshold}}, - 'cami_v2_param1_tr': {'function': cami_v2.run_cami, 'params': { - 'hub_penalty': 0.3, 'damping_factor': 0.7, 'confidence_level': 0.5 - }}, - 'cami_v2_param1_b': {'function': cami_v2.run_cami, 'params': { - 'hub_penalty': 0.3, 'damping_factor': 0.7, 'ranking': 'betweenness', 'confidence_level': 0.5 - }}, - 'cami_v2_param1_hc': {'function': cami_v2.run_cami, 'params': { - 'hub_penalty': 0.3, 'damping_factor': 0.7, 'ranking': 'harmonic', 'confidence_level': 0.5 - }}, - 'cami_v2_param2_tr': {'function': cami_v2.run_cami, 'params': { - 'hub_penalty': 0, 'damping_factor': 0.7, 'confidence_level': 0.5 - }}, - 'cami_v2_param2_b': {'function': cami_v2.run_cami, 'params': { - 'hub_penalty': 0, 'damping_factor': 0.7, 'ranking': 'betweenness', 'confidence_level': 0.5 - }}, - 'cami_v2_param2_hc': {'function': cami_v2.run_cami, 'params': { - 'hub_penalty': 0, 'damping_factor': 0.7, 'ranking': 'harmonic', 'confidence_level': 0.5 - }}, - 'cami_v3_param1_tr': {'function': cami_v3.run_cami, 'params': { - 'hub_penalty': 0.3, 'damping_factor': 0.7, 'confidence_level': 0.5 - }}, - 'cami_v3_param1_b': {'function': cami_v3.run_cami, 'params': { - 'hub_penalty': 0.3, 'damping_factor': 0.7, 'ranking': 'betweenness', 'confidence_level': 0.5 - }}, - 'cami_v3_param1_hc': {'function': 
cami_v3.run_cami, 'params': { - 'hub_penalty': 0.3, 'damping_factor': 0.7, 'ranking': 'harmonic', 'confidence_level': 0.5 - }}, - 'cami_v3_param2_tr': {'function': cami_v3.run_cami, 'params': { - 'hub_penalty': 0, 'damping_factor': 0.7, 'confidence_level': 0.5 - }}, - 'cami_v3_param2_b': {'function': cami_v3.run_cami, 'params': { - 'hub_penalty': 0, 'damping_factor': 0.7, 'ranking': 'betweenness', 'confidence_level': 0.5 - }}, - 'cami_v3_param2_hc': {'function': cami_v3.run_cami, 'params': { - 'hub_penalty': 0, 'damping_factor': 0.7, 'ranking': 'harmonic', 'confidence_level': 0.5 - }}, - } + + params_1 = {'consens_threshold': [consens_threshold], + 'function': {'cami_v1': cami_v1.run_cami}} + + params_0 = {'function': {'union':cami_v1.make_union, + 'intersection':cami_v1.make_intersection, + 'first_neighbours': cami_v1.make_first_neighbor_result_set} + } + + params_tr = {'hub_penalty': [0, 0.25, 0.5, 0.75, 1.0], + 'damping_factor': [0.1, 0.25, 0.5, 0.75], + 'confidence_level': [0.2, 0.35, 0.5, 0.75], + 'ranking': ['trustrank'], + 'function': {'cami_v2': cami_v2.run_cami, + 'cami_v3':cami_v3.run_cami}} + + params_b_m = {'hub_penalty': [0, 0.25, 0.5, 0.75, 1.0], + 'confidence_level': [0.2, 0.35, 0.5, 0.75], + 'ranking': ['betweenness', 'harmonic'], + 'function': {'cami_v2': cami_v2.run_cami, + 'cami_v3':cami_v3.run_cami}} + + cami_setting_list = generate_param_combinations(params_0)+\ + generate_param_combinations(params_1)+\ + generate_param_combinations(params_tr)+\ + generate_param_combinations(params_b_m) + + camis = {} + for setting in cami_setting_list: + if setting[1]: + func_name = setting[0] + '_' +setting[1] + else: + func_name = setting[0] + + camis[func_name] = setting[2] + # camis = { + # 'union': {'function': cami_v1.make_union, 'params': {}}, + # 'intersection': {'function': cami_v1.make_intersection, 'params': {}}, + # 'first_neighbors': {'function': cami_v1.make_first_neighbor_result_set, 'params': {}}, + # 'cami_v1': {'function': cami_v1.run_cami, 
'params': {'consens_threshold': consens_threshold}}, + # 'cami_v2_param1_tr': {'function': cami_v2.run_cami, 'params': { + # 'hub_penalty': 0.3, 'damping_factor': 0.7, 'confidence_level': 0.5 + # }}, + # 'cami_v2_param1_b': {'function': cami_v2.run_cami, 'params': { + # 'hub_penalty': 0.3, 'damping_factor': 0.7, 'ranking': 'betweenness', 'confidence_level': 0.5 + # }}, + # 'cami_v2_param1_hc': {'function': cami_v2.run_cami, 'params': { + # 'hub_penalty': 0.3, 'damping_factor': 0.7, 'ranking': 'harmonic', 'confidence_level': 0.5 + # }}, + # 'cami_v2_param2_tr': {'function': cami_v2.run_cami, 'params': { + # 'hub_penalty': 0, 'damping_factor': 0.7, 'confidence_level': 0.5 + # }}, + # 'cami_v2_param2_b': {'function': cami_v2.run_cami, 'params': { + # 'hub_penalty': 0, 'damping_factor': 0.7, 'ranking': 'betweenness', 'confidence_level': 0.5 + # }}, + # 'cami_v2_param2_hc': {'function': cami_v2.run_cami, 'params': { + # 'hub_penalty': 0, 'damping_factor': 0.7, 'ranking': 'harmonic', 'confidence_level': 0.5 + # }}, + # 'cami_v3_param1_tr': {'function': cami_v3.run_cami, 'params': { + # 'hub_penalty': 0.3, 'damping_factor': 0.7, 'confidence_level': 0.5 + # }}, + # 'cami_v3_param1_b': {'function': cami_v3.run_cami, 'params': { + # 'hub_penalty': 0.3, 'damping_factor': 0.7, 'ranking': 'betweenness', 'confidence_level': 0.5 + # }}, + # 'cami_v3_param1_hc': {'function': cami_v3.run_cami, 'params': { + # 'hub_penalty': 0.3, 'damping_factor': 0.7, 'ranking': 'harmonic', 'confidence_level': 0.5 + # }}, + # 'cami_v3_param2_tr': {'function': cami_v3.run_cami, 'params': { + # 'hub_penalty': 0, 'damping_factor': 0.7, 'confidence_level': 0.5 + # }}, + # 'cami_v3_param2_b': {'function': cami_v3.run_cami, 'params': { + # 'hub_penalty': 0, 'damping_factor': 0.7, 'ranking': 'betweenness', 'confidence_level': 0.5 + # }}, + # 'cami_v3_param2_hc': {'function': cami_v3.run_cami, 'params': { + # 'hub_penalty': 0, 'damping_factor': 0.7, 'ranking': 'harmonic', 'confidence_level': 0.5 + # }}, + 
# } # transform all vertex indices to their corresponding gene names in a result set for tool in result_sets: @@ -405,8 +454,9 @@ class cami(): sys.setrecursionlimit(recursion_limit) # save the results in outputfiles - self.generate_output(cami_method_name, seed_genes, cami_vlist, cami_vertices, putative_vertices, cami_genes, - gene_name_map, codes2tools, cami_scores) + if save_output: + self.generate_output(cami_method_name, seed_genes, cami_vlist, cami_vertices, putative_vertices, cami_genes, + gene_name_map, codes2tools, cami_scores) # add seeds to result sets for drugstone and digest for toolname in self.result_gene_sets: diff --git a/cami_src/consensus/cami_v2.py b/cami_src/consensus/cami_v2.py index 72a02f3b99f4ed0cf7706b07ee8f535e8952efea..5857bd330c5d2b3d6b5dc818e485b6b5e25780c8 100644 --- a/cami_src/consensus/cami_v2.py +++ b/cami_src/consensus/cami_v2.py @@ -8,7 +8,6 @@ from utils.networks import trustrank, betweenness, must, closeness # This uses a trustrank algorithm to rank all putative nodes starting from the seeds and only accepts the top 0.X entries # TODO maybe find a smart way to cutoff automatically? 
def run_cami(result_sets, ppi_graph, seed_lst, predicted_by, cami_scores, code2toolname, tool_code, params): - damping_factor = params['damping_factor'] hub_penalty = params['hub_penalty'] confidence_level = params.get('confidence_level',0.5) weighted = 'weighted' in params and params['weighted'] @@ -36,6 +35,7 @@ def run_cami(result_sets, ppi_graph, seed_lst, predicted_by, cami_scores, code2t for v, c in counts.items(): weights.a[int(v)] = c if ranking_method == 'trustrank': + damping_factor = params['damping_factor'] scores = trustrank(subnet, seed_lst, damping_factor, hub_penalty, weights) elif ranking_method == 'betweenness': scores = betweenness(subnet, hub_penalty, weights) diff --git a/cami_src/consensus/cami_v3.py b/cami_src/consensus/cami_v3.py index aa6522e6bceb58f70bd4d672afaab27cd9fcd6cf..8326847ed2795851339b9c38804ca74ae29c3321 100644 --- a/cami_src/consensus/cami_v3.py +++ b/cami_src/consensus/cami_v3.py @@ -7,7 +7,6 @@ import graph_tool as gt # This uses a trustrank algorithm to rank all putative nodes starting from the seeds and only accepts the top 0.X entries # TODO maybe find a smart way to cutoff automatically? 
import matplotlib.pyplot as plt
import seaborn as sb
import pandas as pd
import os
import random
from cami_suite import cami
import utils.comparison_matrix as comparison_matrix
import numpy as np


def predict_and_make_consensus(cami, vis=False):
    """Run all prediction tools on ``cami`` and build the consensus.

    The consensus is created with ``save_output=False`` so that no per-run
    output files are generated.

    :param cami: object providing ``make_predictions``, ``create_consensus``,
                 ``visualize_and_save_comparison_matrix`` and ``use_nvenn``.
    :param vis: if True, additionally save the comparison matrix and request
                the nvenn visualisation (``download=True``).
    """
    result_sets = cami.make_predictions()
    cami.create_consensus(result_sets, save_output=False)
    if vis:
        cami.visualize_and_save_comparison_matrix()
        cami.use_nvenn(download=True)


def _wrap_tool_label(tool):
    """Return ``tool`` with its second underscore replaced by a line break."""
    second_underscore = tool.find('_', tool.find('_') + 1)
    if second_underscore > -1:
        return tool[:second_underscore] + '\n' + tool[second_underscore + 1:]
    return tool


def _tool_category(tool, wrapped_tool_names):
    """Classify a result name so each violin panel can pick a colour for it."""
    if tool in wrapped_tool_names:
        return 'wrapper'
    # Both spellings are accepted: this script historically checked
    # 'first_neighbors' while the consensus settings use 'first_neighbours'.
    if tool in ('first_neighbors', 'first_neighbours'):
        return 'first_neighbors'
    if tool in ('union', 'intersection'):
        return 'set_operation'
    return 'consensus'


def _draw_violin_panel(ax, samples, tool_labels, facecolors, title, ylabel,
                       wrap_ylabel=True):
    """Draw one violin panel (one violin per tool) on ``ax``."""
    violins = ax.violinplot(samples, showmeans=True, showextrema=True)
    # Everything after the first two entries of the returned dict is drawn
    # in black (same selection as the original script).
    for part in list(violins.keys())[2:]:
        violins[part].set_color('k')
    for body, color in zip(violins['bodies'], facecolors):
        body.set_facecolor(color)

    ax.set_title(title, wrap=True, fontsize=14)
    ax.set_xticks(list(range(1, len(tool_labels) + 1)))
    ax.set_xticklabels(tool_labels)
    ax.tick_params(axis='x', labelsize=11)
    if wrap_ylabel:
        ax.set_ylabel(ylabel, wrap=True, fontsize=14)
    else:
        ax.set_ylabel(ylabel, fontsize=14)


def make_seedvariation(cami, n_iterations, removal_frac=0.2, vis=False, plot=False):
    """Repeatedly remove random seeds and measure how well they are rediscovered.

    In every iteration ``cami`` is reset, a random subset of the original
    seeds is removed, all tools plus the consensus are rerun and the removed
    seeds found again in each result set are recorded.  Two TSV tables (the
    rediscovered seeds per iteration and mean/std summary statistics per
    tool) and two comparison-matrix images are written to
    ``cami.output_dir``.

    :param cami: initialised CAMI object whose ``result_gene_sets`` is
                 already populated (its keys define the evaluated tools).
    :param n_iterations: number of seed-removal rounds (converted with ``int``).
    :param removal_frac: fraction of the seeds removed per round; at least
                         one seed is always removed.
    :param vis: accepted for backwards compatibility; the per-iteration runs
                are always executed without visualisation.
    :param plot: if True, additionally save a three-panel violin plot of the
                 collected rates.
    :return: the ``cami`` object that was passed in.
    """
    identifier = cami.uid
    base_seeds = cami.origin_seed_lst
    original_seeds = [cami.ppi_vertex2gene[seed] for seed in base_seeds]
    print(f'All given seeds:{original_seeds}')

    # Fixed seed so the sequence of removed seeds is reproducible.
    random.seed(50)
    nof_iterations = int(n_iterations)
    used_tools = list(cami.result_gene_sets.keys())
    nof_seeds = len(base_seeds)
    nof_removals = max([int(nof_seeds * removal_frac), 1])

    redisc_seeds_file = f'{cami.output_dir}/00_seedvariation_rediscovered_seeds.tsv'
    result_table_file = f'{cami.output_dir}/00_seedvariation_result_table.tsv'

    # Counts how often two algorithms rediscovered the same removed seed.
    redisc_intersection_matrix = pd.DataFrame(0,
                                              columns=used_tools,
                                              index=used_tools,
                                              dtype=int)

    # Result dictionaries of the form {tool: [value for each iteration]}.
    tp_rate_dict = {tool: [] for tool in used_tools}
    redisc_rate_dict = {tool: [] for tool in used_tools}
    module_size_dict = {tool: [] for tool in used_tools}

    with open(redisc_seeds_file, 'w') as redisc_table, \
            open(result_table_file, 'w') as res_table:
        redisc_table.write('id')
        for tool in used_tools:
            redisc_table.write(f'\t{tool}')
        redisc_table.write('\n')
        res_table.write('tool\trdr\trdr_std\tsensitivity\tsensitivity_std\tprecision\tprecision_std\n')

        for ident in range(nof_iterations):
            redisc_table.write(f'{ident}')
            # Reset cami under an iteration-specific uid.
            cami.reset_cami(new_uid=identifier + f'_{ident}')

            # Remove a fresh random subset of the original seeds.
            print(f'Removing {nof_removals} seeds from the original seed list...')
            removed_seeds_idx = random.sample(range(nof_seeds), nof_removals)
            removed_seeds = cami.remove_seeds(removed_seeds_idx)
            rem_seeds = [cami.ppi_vertex2gene[seed] for seed in removed_seeds]
            print(f'Removed: {rem_seeds} from the seed list')
            print('Updating tools and repeat CAMI')
            cami.initialize_all_tools()

            # Repeat the predictions and the consensus on the reduced seeds.
            predict_and_make_consensus(cami)

            used_seeds = [cami.ppi_vertex2gene[seed] for seed in cami.seed_lst]
            removed_gene_set = set(rem_seeds)

            redisc_seeds_dict = {}
            result_dict = cami.result_gene_sets
            for tool in result_dict:
                nof_predictions = len(result_dict[tool]) + len(used_seeds)
                redisc_seeds = set(result_dict[tool]).intersection(removed_gene_set)
                redisc_prev = len(redisc_seeds)
                redisc_seeds_dict[tool] = redisc_seeds

                redisc_rate_dict[tool].append(redisc_prev / nof_removals)
                tp_rate_dict[tool].append(redisc_prev / len(removed_seeds))
                module_size_frac = redisc_prev / nof_predictions
                assert module_size_frac <= 1
                module_size_dict[tool].append(module_size_frac)

                # One tab-separated cell per tool, seeds joined by commas.
                redisc_table.write('\t' + ','.join(f'{seed}' for seed in redisc_seeds))
                print(f'{tool} rediscovered {redisc_seeds} after removing {rem_seeds}.')
            redisc_table.write('\n')

            for algo1 in redisc_seeds_dict:
                for algo2 in redisc_seeds_dict:
                    redisc_intersection_matrix.loc[algo1, algo2] += len(
                        redisc_seeds_dict[algo1].intersection(redisc_seeds_dict[algo2]))

        for tool in redisc_rate_dict:
            res_table.write(f'{tool}\t')
            res_table.write(f'{np.mean(redisc_rate_dict[tool])}\t')
            res_table.write(f'{np.std(redisc_rate_dict[tool])}\t')
            res_table.write(f'{np.mean(tp_rate_dict[tool])}\t')
            res_table.write(f'{np.std(tp_rate_dict[tool])}\t')
            res_table.write(f'{np.mean(module_size_dict[tool])}\t')
            res_table.write(f'{np.std(module_size_dict[tool])}\n')

    print(f'Result tables are saved in the following locations:')
    print(redisc_seeds_file)
    print(result_table_file)

    # NOTE(review): the figures returned here are saved but never closed -
    # confirm whether plot_comparison_matrix expects the caller to close them.
    matrix_fig, _, normalized_fig, _ = comparison_matrix.plot_comparison_matrix(
        redisc_intersection_matrix, n_rows=cami.nof_tools,
        title=f'number of times algorithms rediscovered the same seeds after removing {nof_removals} seeds')
    matrix_fig.savefig(f'{cami.output_dir}/same_rediscs_{identifier}_comparison_matrix.png')
    normalized_fig.savefig(f'{cami.output_dir}/same_rediscs_{identifier}_comparison_matrix_normalized.png')

    if plot:
        tools = list(redisc_rate_dict.keys())
        tool_labels = [_wrap_tool_label(tool) for tool in tools]
        # Build the wrapper-name lookup once instead of once per violin.
        wrapped_tool_names = {tw.name for tw in cami.tool_wrappers}
        categories = [_tool_category(tool, wrapped_tool_names) for tool in tools]

        # Top: rediscovery rate, middle: precision, bottom: sensitivity.
        fig, (ax_rdr, ax_precision, ax_sensitivity) = plt.subplots(3, 1, figsize=(20, 20))
        fig.subplots_adjust(left=0.2)

        rdr_colors = {'wrapper': 'saddlebrown', 'first_neighbors': 'orange',
                      'set_operation': 'peachpuff', 'consensus': 'red'}
        _draw_violin_panel(
            ax_rdr,
            [redisc_rate_dict[tool] for tool in tools],
            tool_labels,
            [rdr_colors[category] for category in categories],
            f'Rediscovery rate after randomly removing {nof_removals} seeds {nof_iterations} times from {identifier} seeds.',
            'Rediscovery rate (<rediscovered seeds>/<removed seeds>)')

        sensitivity_colors = {'wrapper': 'tan', 'first_neighbors': 'peachpuff',
                              'set_operation': 'orange', 'consensus': 'darkorange'}
        _draw_violin_panel(
            ax_sensitivity,
            [tp_rate_dict[tool] for tool in tools],
            tool_labels,
            [sensitivity_colors[category] for category in categories],
            f'True positive rates after randomly removing {nof_removals} seeds {nof_iterations} times from {identifier} seeds.',
            'Sensitivity (TP/TP + FN)')

        precision_colors = {'wrapper': 'midnightblue', 'first_neighbors': 'mediumblue',
                            'set_operation': 'lightsteelblue', 'consensus': 'royalblue'}
        _draw_violin_panel(
            ax_precision,
            [module_size_dict[tool] for tool in tools],
            tool_labels,
            [precision_colors[category] for category in categories],
            f'Ratio of number of rediscovered seeds and predicted module size after removing {nof_removals} seeds {nof_iterations} times from {identifier} seeds.',
            'precision (<rediscovered seeds>/<module size>)',
            wrap_ylabel=False)

        fig.tight_layout()
        fig.savefig(f'{cami.output_dir}/00_{identifier}_seed_variation_result.png', bbox_inches="tight")
        plt.close(fig)
        print(f'Violin plot saved under: 00_{identifier}_seed_variation_result.png')
    return cami
a/cami_src/preprocess.py +++ b/cami_src/preprocess.py @@ -144,7 +144,7 @@ def csv2graph(inputfile, if unseen_vertices == 0: break g.vertex_properties["cami_score"] = g.new_vertex_property("float", val=0.0) - values = (50) * [-1] + values = (255) * [-1] g.vertex_properties["predicted_by"] = g.new_vertex_property("vector<int16_t>", val=values) return g