From 5e6674cb55ef479802c3fd19c7b37086f43ec6a6 Mon Sep 17 00:00:00 2001 From: bay9355 <mia.le@studium.uni-hamburg.de> Date: Mon, 27 Mar 2023 15:14:30 +0200 Subject: [PATCH] restructured cami --- cami_src/algorithms/AlgorithmWrapper.py | 16 +- cami_src/algorithms/DiamondWrapper.py | 4 +- cami_src/algorithms/DominoWrapper.py | 4 +- cami_src/algorithms/HHotNetWrapper.py | 4 +- cami_src/algorithms/RobustWrapper.py | 4 +- cami_src/algorithms/TemplateWrapper.py | 4 +- cami_src/cami.py | 397 +++--------------------- cami_src/cami_suite.py | 167 +++++----- cami_src/camiconf | 7 + cami_src/consensus/cami_v1.py | 38 ++- cami_src/evaluation/__init__.py | 0 cami_src/preprocess.py | 128 +++++++- cami_src/seed_variationconf | 5 +- 13 files changed, 319 insertions(+), 459 deletions(-) delete mode 100644 cami_src/evaluation/__init__.py diff --git a/cami_src/algorithms/AlgorithmWrapper.py b/cami_src/algorithms/AlgorithmWrapper.py index 17de832..7f51dc5 100644 --- a/cami_src/algorithms/AlgorithmWrapper.py +++ b/cami_src/algorithms/AlgorithmWrapper.py @@ -1,12 +1,13 @@ from graph_tool import Graph import os +from configparser import ConfigParser from numpy import outer class AlgorithmWrapper(object): #TODO: temporary/Working directory als parameter? """Abstract wrapper class for the network enrichment algorithms used in the tests.""" - def __init__(self): + def __init__(self, config): self.uid = '0000' self.name = '' self.weight = 1 @@ -14,13 +15,18 @@ class AlgorithmWrapper(object): self.ppi_network = Graph() self.seeds = list() self.home_path = '' - self.config = 'camiconf' + self.config = config self.code = 99 self.debug = False - def set_weight(self, weight): - self.weight = weight - + def set_weight(self, weight=None): + if weight == None: + config = ConfigParser() + config.read(self.config) + self.weight = float(config.get(str(self.name).lower(), 'toolweight')) + else: + self.weight = weight + def set_config(self, config_file): self.config = config_file diff --git a/cami_src/algorithms/DiamondWrapper.py b/cami_src/algorithms/DiamondWrapper.py index ee61636..35a2636 100644 --- a/cami_src/algorithms/DiamondWrapper.py +++ b/cami_src/algorithms/DiamondWrapper.py @@ -6,8 +6,8 @@ from configparser import ConfigParser import ast class DiamondWrapper(AlgorithmWrapper): - def __init__(self): - super().__init__() + def __init__(self, config): + super().__init__(config) self.name = 'DIAMOnD' self.code = 1 config = ConfigParser() diff --git a/cami_src/algorithms/DominoWrapper.py b/cami_src/algorithms/DominoWrapper.py index 34953a3..2c84acc 100644 --- a/cami_src/algorithms/DominoWrapper.py +++ b/cami_src/algorithms/DominoWrapper.py @@ -8,8 +8,8 @@ from configparser import ConfigParser import ast class DominoWrapper(AlgorithmWrapper): - def __init__(self): - super().__init__() + def __init__(self, config): + super().__init__(config) self.name = 'DOMINO' self.code = 2 config = ConfigParser() diff --git a/cami_src/algorithms/HHotNetWrapper.py b/cami_src/algorithms/HHotNetWrapper.py index 8f5b59b..13f097a 100644 --- a/cami_src/algorithms/HHotNetWrapper.py +++ b/cami_src/algorithms/HHotNetWrapper.py @@ -10,8 +10,8 @@ import itertools as itt class HHotNetWrapper(AlgorithmWrapper): - def __init__(self): - super().__init__() + def __init__(self, config): + super().__init__(config) self.name = 'HHotNet' self.code = 4 diff --git a/cami_src/algorithms/RobustWrapper.py b/cami_src/algorithms/RobustWrapper.py index fb89b20..14677ae 100755 --- a/cami_src/algorithms/RobustWrapper.py +++ b/cami_src/algorithms/RobustWrapper.py @@ -16,8 +16,8 @@ import ast class RobustWrapper(AlgorithmWrapper): - def __init__(self): - super().__init__() + def __init__(self, config): + super().__init__(config) self.name = 'ROBUST' self.code = 3 config = ConfigParser() diff --git a/cami_src/algorithms/TemplateWrapper.py b/cami_src/algorithms/TemplateWrapper.py index 066b679..719aaef 100644 --- a/cami_src/algorithms/TemplateWrapper.py +++ b/cami_src/algorithms/TemplateWrapper.py @@ -4,7 +4,7 @@ import subprocess from configparser import ConfigParser class TemplateWrapper(AlgorithmWrapper): - def __init__(self): + def __init__(self, config): """Each tool needs certain predefined instance variables: The only instance variables that need to be defined for each tool individually are: - name (string): The name of the algorithm/tool @@ -21,7 +21,7 @@ class TemplateWrapper(AlgorithmWrapper): - config (string) There is no need to redefine these variables when introducing a new algorithm. """ - super().__init__() + super().__init__(config) self.name = '<tool_name>' self.code = 4 config = ConfigParser() diff --git a/cami_src/cami.py b/cami_src/cami.py index 3ab225c..9d75ece 100755 --- a/cami_src/cami.py +++ b/cami_src/cami.py @@ -2,68 +2,29 @@ import os import sys import shutil -from algorithms.DiamondWrapper import DiamondWrapper -from algorithms.DominoWrapper import DominoWrapper -from algorithms.RobustWrapper import RobustWrapper -from algorithms.HHotNetWrapper import HHotNetWrapper -from algorithms.AlgorithmWrapper import AlgorithmWrapper import preprocess -import uuid import cami_suite import argparse import webbrowser -import random -import matplotlib.pyplot as plt -import seaborn as sb -import pandas as pd - -#MC: -from configparser import ConfigParser - +import evaluation_scripts.seed_variation_script as sv def main(ppi_network, seeds, tools, tool_weights, consensus, evaluate, output_dir, identifier, save_temps, nvenn, save_image, force, drugstone, ncbi, configuration, - seed_variation, parallelization, external_results, debug): + seed_variation, parallelization, external_results, external_only, debug, comparison_matrix, visualize): print('CAMI started') - config = ConfigParser() - config.read(configuration) - seed_score = config.get('cami', 'seed_score') - nof_tools = len(tools) + len(external_results) - - wrappers = {'diamond': DiamondWrapper(), - 'domino': DominoWrapper(), - 'robust': RobustWrapper(), - 'hotnet': HHotNetWrapper() - } - - tool_wrappers = [wrappers[tool] for tool in tools] - nof_cami_tools = len(wrappers) - - if tool_weights == None: - tool_weights = nof_tools * [1] + # preprocess the inputfiles + if external_only: + ppi_graph, checked_seed_lst, initial_seed_lst, tool_wrappers = preprocess.all(ppi_network, seeds, configuration, [], external_results) + else: + ppi_graph, checked_seed_lst, initial_seed_lst, tool_wrappers = preprocess.all(ppi_network, seeds, configuration, tools, external_results) - ext_wrappers = [] - if len(external_results) > 0: - for idx in range(len(external_results)): - dummy_wrapper = AlgorithmWrapper() - dummy_wrapper.code = idx + 1 + nof_cami_tools - ext_wrappers.append(dummy_wrapper) - - ppi_graph = preprocess.csv2graph(ppi_network, symbol_columns=drugstone, nof_tools=nof_tools) - - seed_lst = preprocess.txt2lst(seeds) - initial_seedlst = seed_lst.copy() - - # rename seeds with their corresponding indices in the ppi_graph - name2index = preprocess.name2index_dict(ppi_graph) - checked_seed_lst = [name2index[v] for v in seed_lst if v in name2index] if debug: print('Created the PPI-network Graph and seed list.') n = len(checked_seed_lst) - m = len(seed_lst) + m = len(initial_seed_lst) if n != m and not force: name = ppi_graph.vertex_properties["name"] - print(f'There are {abs(m-n)} seeds missing in the PPI network: ({set([name[cseed] for cseed in checked_seed_lst]).symmetric_difference(set(initial_seedlst))})') + print(f'There are {abs(m-n)} seeds missing in the PPI network: ({set([name[cseed] for cseed in checked_seed_lst]).symmetric_difference(set(initial_seed_lst))})') choice = input('Continue? [y/n]') if choice == 'y': pass @@ -71,85 +32,33 @@ def main(ppi_network, seeds, tools, tool_weights, consensus, evaluate, print('Aborted because of missing seeds. Please try again.') exit(1) - seed_lst = checked_seed_lst.copy() - if debug: print(f'Continuing with vertices at indices {[int(seed) for seed in seed_lst]} as input seeds') + if debug: print(f'Continuing with vertices at indices {[int(seed) for seed in checked_seed_lst]} as input seeds') - # change directory to ~/cami/cami (home of cami.py) - cami_home = sys.argv[0].rsplit('/', 1) - os.chdir(cami_home[0]) + # change directory to ~/cami/cami_src (home of cami.py) + cami_home = os.path.dirname(os.path.abspath(__file__)) + + os.chdir(cami_home) home_path = os.path.dirname(os.getcwd()) if debug: print(f"Home directory of cami: {home_path}") - if identifier==None: - identifier = str(uuid.uuid4()) - - if output_dir==None: - output_dir = os.path.join(home_path, f'data/output/{identifier}') - output_dir = os.path.abspath(output_dir) - if debug: - print(f"Output directory of cami: {output_dir}") - if not os.path.exists(output_dir): - os.makedirs(output_dir) - - if not os.path.exists(output_dir): - os.makedirs(output_dir) - - #create temporary directory - tmp_dir = os.path.join(home_path, f'data/tmp/{identifier}') - if debug: - print(f'Creating unique temporary directory for CAMI: {tmp_dir}') - - if not os.path.exists(tmp_dir): - os.makedirs(tmp_dir) - - elif not force: - print(f'TemporaryDirectory {tmp_dir} already exists.') - choice = input('overwrite? [y/n]') - if choice == 'y': - pass - # shutil.rmtree(tmp_dir) - # os.makedirs(tmp_dir) - else: - print('Abort. Please try again.') - exit(1) # initialize CAMI - cami = cami_suite.cami(ppi_graph,seed_lst,tool_wrappers,output_dir,identifier,home_path, - tmp_dir=tmp_dir, config=configuration, seed_score=seed_score, - parallelization=parallelization) - - if ncbi: - cami.ncbi = True + cami = cami_suite.cami(ppi_graph,checked_seed_lst,tool_wrappers,home_path, + initial_seed_lst=initial_seed_lst,uid=identifier, output_dir=output_dir, + configuration=configuration, + parallelization=parallelization, ncbi=ncbi, debug=debug, + save_temps=save_temps, ) - if debug: - cami.debug = True - - for tool in tool_wrappers: - cami.initialize_tool(tool) + if tool_weights is not None: + cami.initialize_tool() - for tool in ext_wrappers: - cami.initialize_tool(tool) - - for idx, tool in enumerate(tool_wrappers): - tool.set_weight(float(tool_weights[idx])) - - cami.set_initial_seed_lst(initial_seedlst) - - if (consensus or (not consensus and not evaluate and not seed_variation)) and not seed_variation: + if consensus or (not consensus and not evaluate) or seed_variation: result_sets = cami.make_predictions() - if len(external_results) > 0: - assert(len(ext_wrappers) == len(external_results)) - external_input = {} - for i in range(len(ext_wrappers)): - external_input[ext_wrappers[i]] = external_results[i] - result_sets = cami.take_custom_results(external_input, result_sets) cami.create_consensus(result_sets) - # adds the seeds to the results, right now result_sets contains the seeds that should be ADDED to the module # for result in result_sets.keys(): # result_sets[result] = result_sets[result].union(cami.seed_lst) - - if nvenn or save_image: + if nvenn or save_image or visualize: print('Sending results to nVenn') url = cami.use_nvenn() if url: @@ -157,254 +66,28 @@ def main(ppi_network, seeds, tools, tool_weights, consensus, evaluate, webbrowser.open(url) if save_image: cami.download_diagram(url) - + + if visualize or comparison_matrix: + cami.visualize_and_save_comparison_matrix() + if drugstone is not None: print('Sending results to DrugstOne') cami.use_drugstone() + if consensus: cami.reset_cami() - if evaluate or (not consensus and not evaluate and not seed_variation): + if evaluate or (not consensus and not evaluate) or seed_variation: cami.make_evaluation() - if evaluate: - cami.reset_cami() - elif (not consensus and not evaluate): - cami.reset_cami() + cami.reset_cami() # SEED VARIATION if seed_variation: - - def predict_and_make_consensus(vis=False): - result_sets = cami.make_predictions() - cami.create_consensus(result_sets) - if vis: - n_results = len(cami.result_gene_sets) - comparison_matrix = pd.DataFrame([[int(0) for _ in range(n_results)] for __ in range(n_results)], - columns = list(cami.result_gene_sets.keys()), - index = list(cami.result_gene_sets.keys()), - dtype=int) - for algo1 in cami.result_gene_sets: - for algo2 in cami.result_gene_sets: - comparison_matrix.loc[algo1,algo2] = int(len(cami.result_gene_sets[algo1].intersection(cami.result_gene_sets[algo2]))) - - fig2, ax3 = plt.subplots(figsize=(20,20)) - - ax3 = sb.heatmap(comparison_matrix, annot=True, fmt='g') - ax3.set_title('Intersections of result_gene_sets of all analyzed algorithms.') - fig2.savefig(f'{output_dir}/heatmap_{cami.uid}.png', bbox_inches="tight") - print(f'saved intersection heatmap of all algorithms under: {output_dir}/heatmap_{cami.uid}.png') - plt.close(fig2) - - fig2a, ax3a = plt.subplots(figsize=(20,20)) - comparison_matrix_normalized = comparison_matrix.apply(lambda row: row/row.max(), axis=1) - ax3a = sb.heatmap(comparison_matrix_normalized, annot=True, fmt='.2f') - ax3a.set_title('Normalized intersections of result_gene_sets of all analyzed algorithms.') - fig2a.savefig(f'{output_dir}/heatmap_{cami.uid}_normalized.png', bbox_inches="tight") - print(f'saved intersection heatmap of all algorithms under: {output_dir}/heatmap_{cami.uid}_normalized.png') - plt.close(fig2a) - if nvenn and vis: - url = cami.use_nvenn() - cami.download_diagram(url) - - with open(f'{output_dir}/00_node_degrees.tsv', 'w') as node_degrees: - node_degrees.write('vertex\tout_degree\tin_degree\n') - for vertex in cami.ppi_graph.vertices(): - node_degrees.write(f'{cami.ppi_vertex2gene[vertex]}\t{vertex.out_degree()}\t{vertex.in_degree()}\n') - - # initialize cami and seed_var - base_seeds = cami.origin_seed_lst - original_seeds = [cami.ppi_vertex2gene[seed] for seed in base_seeds] - print(f'Initializing CAMI and the seed variation by running CAMI with all given seeds:{original_seeds}') - - with open(os.path.join(home_path, f'data/output/explorativeness.tsv'), 'a') as f: - predict_and_make_consensus(vis=True) - seedname = seeds - for tool in cami.result_gene_sets: - f.write(f'\n{seedname}\t{len(cami.seed_lst)}\t{tool}\t{len(cami.result_gene_sets[tool])}') - cami.make_evaluation() - #predict_and_make_consensus(vis=True) - - random.seed(50) - removal_frac = 0.2 - nof_iterations = int(seed_variation) - used_tools = list(cami.result_gene_sets.keys()) - nof_seeds = len(base_seeds) - nof_removals = max([int(nof_seeds * removal_frac), 1]) - variation_results = [] - result_file_1 = f'{output_dir}/00_seedvariation_rediscovered_seeds.tsv' - result_file_2 = f'{output_dir}/00_seedvariation_rediscovery_rates.tsv' - result_file_3 = f'{output_dir}/00_seedvariation_precision.tsv' - result_file_4 = f'{output_dir}/00_seedvariation_results4.tsv' - n_results = len(cami.result_gene_sets) - - redisc_intersection_matrix = pd.DataFrame([[0 for _ in range(n_results)] for __ in range(n_results)], - columns = list(cami.result_gene_sets.keys()), - index = list(cami.result_gene_sets.keys()), - dtype=int) - with open(result_file_1, 'w') as res_table1: - with open(result_file_2, 'w') as res_table2: - with open(result_file_3, 'w') as res_table3: - with open(result_file_4, 'w') as res_table4: - res_table1.write('id') - res_table2.write('id\tnof_removed_seeds\tremoved_seeds\t{used_seeds}\ttool\trediscovery_prevalence') - res_table2.write('\trediscovery_rate\trediscovered_seeds') - for tool in used_tools: - res_table1.write(f'\t{tool}') - res_table1.write('\n') - res_table2.write('\n') - res_table3.write('id\ttrue_positives\trediscoveries/module_size\n') - tp_rate_dict = {k:list() for k in used_tools} - module_size_dict = {k:list() for k in used_tools} - - for ident in range(nof_iterations): - res_table1.write(f'{ident}') - # update uid - new_identifier = identifier + f'_{ident}' - # reset cami - cami.reset_cami(new_uid=new_identifier) -# cami.ppi_graph = original_ppi - - #remove seeds (again) - print(f'Removing {nof_removals} seeds from the original seed list...') - removed_seeds_idx = random.sample(list(range(nof_seeds)), nof_removals) - removed_seeds = cami.remove_seeds(removed_seeds_idx) - rem_seeds = [cami.ppi_vertex2gene[seed] for seed in removed_seeds] - print(f'Removed: {rem_seeds} from the seed list') - print('Updating tools and repeat CAMI') - # reinitialize tools - cami.initialize_all_tools() - - # repeat consensus - if ident%20==0: - predict_and_make_consensus(vis=True) - else: - predict_and_make_consensus() - - used_seeds = [cami.ppi_vertex2gene[seed] for seed in cami.seed_lst] - - - result_dict = cami.result_gene_sets - redisc_rate_dict = {} - redisc_seeds_dict = {} - - for tool in result_dict: - nof_predictions = len(result_dict[tool]) + len(used_seeds) - redisc_seeds = set(result_dict[tool]).intersection(set(rem_seeds)) - redisc_prev = len(redisc_seeds) - redisc_rate = redisc_prev / nof_removals - redisc_rate_dict[tool] = redisc_rate #true positive rate: rate verhältnis von gefundenen und allgemein gefundenen - redisc_seeds_dict[tool] = redisc_seeds - tp_rate = redisc_prev / len(removed_seeds) - tp_rate_dict[tool].append(tp_rate) - module_size_frac = redisc_prev / nof_predictions - assert module_size_frac <= 1 - module_size_dict[tool].append(module_size_frac) - res_table3.write(f'{ident}\t{tool}\t{tp_rate}\t{module_size_frac}\n') - res_table1.write('\t') - for idx,seed in enumerate(redisc_seeds): - if idx == 0: - res_table1.write(f'{list(redisc_seeds)[0]}') - else: - res_table1.write(f',{seed}') - print(f'{tool} rediscovered {redisc_seeds} after removing {rem_seeds}.') - res_table2.write(f'{ident}\t{tool}\t{nof_removals},\t{rem_seeds}\t{used_seeds}\t{redisc_prev}\t{redisc_rate}\t{redisc_seeds}\n') - res_table1.write('\n') - variation_results.append((redisc_rate_dict, redisc_seeds_dict, used_seeds, rem_seeds)) - for algo1 in redisc_seeds_dict: - for algo2 in redisc_rate_dict: - redisc_intersection_matrix.loc[algo1,algo2] += len(redisc_seeds_dict[algo1].intersection(redisc_seeds_dict[algo2])) - print(f'Result tables are saved in the following locations:') - print(f'Rediscovered seeds: {result_file_1}') - print(f'Rediscovery Rates: {result_file_2}') - print(f'Sensitivity: {result_file_3}') - fig3, ax6 = plt.subplots(figsize=(20,20)) - ax6 = sb.heatmap(redisc_intersection_matrix, annot=True, fmt='g') - ax6.set_title('Number of times the algorithms rediscovered the same seeds') - fig3.savefig(f'{output_dir}/00_seedvariation_heatmap_{identifier}.png', bbox_inches="tight") - print(f'saved intersection heatmap of all algorithms under:{output_dir}/heatmap_{identifier}.png') - plt.close(fig3) - fig3a, ax6a = plt.subplots(figsize=(20,20)) - redisc_intersection_matrix_normalized = redisc_intersection_matrix.apply(lambda row: row/row.max(), axis=1) - ax6a = sb.heatmap(redisc_intersection_matrix_normalized, annot=True, fmt='.2f') - ax6a.set_title('Normalized numbers of times the algorithms rediscovered the same seeds') - fig3a.savefig(f'{output_dir}/00_seedvariation_heatmap_{identifier}_normalized.png', bbox_inches="tight") - print(f'saved intersection heatmap of all algorithms under:{output_dir}/heatmap_normalized_{cami.uid}.png') - plt.close(fig3a) - -# print(variation_results) - rediscovery_rates_results = [results[0] for results in variation_results] -# print(rediscovery_rates_results) - tools = [tool for tool in rediscovery_rates_results[0].keys()] - tool_labels = tools.copy() - redisc_rates = [[res[tool] for res in rediscovery_rates_results] for tool in tools] - for idx,tool in enumerate(tools): - if '_' in tool: - # find the index of the second occurrence of the character - second_occurrence_index = tool.find('_', tool.find('_') + 1) - if second_occurrence_index > -1: - # replace the character at that index with the replacement character - tool_name = tool[:second_occurrence_index] + '\n' + tool[second_occurrence_index + 1:] - tool_labels[idx] = tool_name - - #PLOT - # Create a figure instance - #print(sys.getrecursionlimit()) - fig1, (ax1, ax5, ax4) = plt.subplots(3, 1, figsize=(20,20)) - fig1.subplots_adjust(left=0.2) - # Extract Figure and Axes instance - - # Create a plot - violins1 = ax1.violinplot(redisc_rates, showmeans=True, showextrema=True) - - for violinpart in list(violins1.keys())[2:]: - violins1[violinpart].set_color('k') - - for violin in violins1['bodies']: - violin.set_facecolor('red') - # Add title - ax1.set_title(f'Rediscovery rate after randomly removing {nof_removals} seeds {nof_iterations} times from {identifier} seeds.', wrap=True, fontsize=14) - - ax1.set_xticks(list(range(1,len(tools)+1))) - ax1.set_xticklabels(tool_labels) - ax1.tick_params(axis='x', labelsize=11) - - ax1.set_ylabel('Rediscovery rate (<rediscovered seeds>/<removed seeds>)', wrap=True, fontsize=14) - - - violins2 = ax4.violinplot([tp_rate_dict[tool] for tool in tools], showmeans=True, showextrema=True) - for violinpart in list(violins2.keys())[2:]: - violins2[violinpart].set_color('k') - for violin in violins2['bodies']: - violin.set_facecolor('orange') - # Add title - ax4.set_title(f'True positive rates after randomly removing {nof_removals} seeds {nof_iterations} times from {identifier} seeds.', wrap=True, fontsize=14) - - ax4.set_xticks(list(range(1,len(tools)+1))) - ax4.set_xticklabels(tool_labels) - ax4.tick_params(axis='x', labelsize=11) - - ax4.set_ylabel('Sensitivity (TP/TP + FN)', wrap=True, fontsize=14) - - violins3 = ax5.violinplot([module_size_dict[tool] for tool in tools], showmeans=True, showextrema=True) - # Add title - for violinpart in list(violins3.keys())[2:]: - violins3[violinpart].set_color('k') - - for violin in violins3['bodies']: - violin.set_facecolor('blue') - ax5.set_title(f'Ratio of number of rediscovered seeds and predicted module size after removing {nof_removals} seeds {nof_iterations} times from {identifier} seeds.', wrap=True, fontsize=14) - - ax5.set_xticks(list(range(1,len(tools)+1))) - ax5.set_xticklabels(tool_labels) - - ax5.set_ylabel('precision (<rediscovered seeds>/<module size>)', fontsize=14) - ax5.tick_params(axis='x', labelsize=11) - fig1.tight_layout() - fig1.savefig(f'{output_dir}/00_{identifier}_seed_variation_result.png', bbox_inches="tight") - plt.close(fig1) - print(f'Violin plot saved under: 00_{identifier}_seed_variation_result.png') - + cami = sv.make_seedvariation(cami, seed_variation, removal_frac=0.2, vis=True) + + tmp_dir = cami.tmp_dir + if save_temps: print(f'All temporary files were kept in {tmp_dir}') print(f'You can remove them by typing: rm -rv {tmp_dir}') @@ -435,16 +118,20 @@ if __name__ == "__main__": "domino\ndiamond\nrobust\hotnet\nThe default tools are: diamond, domino and robust.") parser.add_argument('-ext', '--external_results', nargs='+', action='store', default=[], help="List of paths to results generated by external tools. The files should contain the tool name in the first line and then the predicted genes (1 gene per line)") + parser.add_argument('-ext_only', '--external_only', nargs='+', action='store', default=[], + help="List of paths to results generated by external tools. The files should contain the tool name in the first line and then the predicted genes (1 gene per line). Only the external results will be used for consensus prediction.") parser.add_argument('-w', '--tool_weights', nargs='+', action='store', default=None, help="List of weights for the tools. If you have [domino, diamond, robust] as list of tools and diamonds weight should be twice as high as the other tools type: 1 2 1") parser.add_argument('-c', '--consensus', action='store_true', help="run only the consensus prediction part of cami") parser.add_argument('-var', '--seed_variation', action='store', help="repeat consensus selection multiple times (please provide the number of iterations) while removing 20 percent of the seeds.") - parser.add_argument('-e', '--evaluate', action='store_true', help="run only the evaluation part of cami") - parser.add_argument('-o', '--output_dir', action='store', help="path to output directory") + parser.add_argument('-e', '--evaluate', action='store_true', help="evaluation using DIGEST") + parser.add_argument('-o', '--output_dir', action='store', help="path to output directory", default=None) parser.add_argument('-id', '--identifier', action='store', help="ID for the current excecution of cami. Defaults to a randomly generated ID") parser.add_argument('-tmp', '--save_temps', action='store_true', help="keep all temporary files") - parser.add_argument('-v', '--nvenn', action='store_true', help="Visualize results using nVenn by Degradome, an external webtool. Please note that degradome can only be used for visualization with up to 5 tools.") - parser.add_argument('-img', '--save_image', action='store_true', help="Save the venn diagram from the visualization as png. (Only possible for up to 5 tools)") + parser.add_argument('-v', '--visualize', action='store_true', help="Visualize the results with nVenn, comparison matrix and drugstone") + parser.add_argument('-comp', '--comparison_matrix', action='store_true', help="Visualize the results with a comparison matrix") + parser.add_argument('-nv', '--nvenn', action='store_true', help="Visualize results using nVenn by Degradome, an external webtool. Please note that nvenn can only be used for visualization with up to 5 tools.") + parser.add_argument('-img', '--save_image', action='store_true', help="Save the venn diagram and/or comparison matrix from the visualization functions as png.") parser.add_argument('-f', '--force', action='store_true', help="Ignore warnings and overwrite everything when excecuting CAMI.") parser.add_argument('-d', '--drugstone', nargs='*', action='store', default=None, help="Visualize the cami module via the drugstone API. If necessary the user needs to provide a list of the two titles of the two columns that contain the symbols of the gene edges in the inputfile of the PPI network. The order needs to correspond to the order of the first two columns. The default is 'Official_Symbol_Interactor_A Official_Symbol_Interactor_B'. Please note that the symbol columns cannot be the first two columns. If the two main edges in the first two columns are correspond also the gene symbols please duplicate these columns.") diff --git a/cami_src/cami_suite.py b/cami_src/cami_suite.py index 8e5a886..2c88390 100644 --- a/cami_src/cami_suite.py +++ b/cami_src/cami_suite.py @@ -1,6 +1,7 @@ import sys import threading, biodigest, os -from utils import drugstone, degradome, ncbi +import uuid +from utils import drugstone, degradome, ncbi, comparison_matrix from algorithms.DiamondWrapper import DiamondWrapper from algorithms.DominoWrapper import DominoWrapper from algorithms.RobustWrapper import RobustWrapper @@ -8,34 +9,6 @@ from configparser import ConfigParser import preprocess from consensus import cami_v1, cami_v2, cami_v3 - -def list_combinations(lst, k): - """creates all possible combinations of length k with two objects in a list - - :param lst: a list with length 2 - :type lst: list() - :param k: length of the combinations - :type k: int - """ - nof_combs = int(2 ** k) - l = int(nof_combs / 2) - columns = [] - while l >= 1: - column = [] - while len(column) < nof_combs: - for _ in range(l): - column.append(lst[0]) - for _ in range(l): - column.append(lst[1]) - columns.append(column) - l = l / 2 - if l >= 1: - l = int(l) - combs = [tuple([column[i] for column in columns]) for i in range(nof_combs)] - assert len(set(combs)) == nof_combs - return (combs) - - def initialize_cami(path_to_ppi_file=''): cami_params = {} # find homepath aka ~/cami @@ -91,8 +64,8 @@ class cami(): consensus approach """ - def __init__(self, ppi_graph, seed_lst, tool_wrappers, output_dir, uid, home_path, tmp_dir='', config='camiconf', - seed_score=10, parallelization=False): + def __init__(self, ppi_graph, seed_lst, tool_wrappers, home_path, initial_seed_lst, uid=None, output_dir='', configuration='camiconf', + parallelization=False, ncbi=False, debug=False,save_temps=False, toolweights=None): """Instance variables of CAMI :param ppi_graph: The PPI-Graph on which all predictions in CAMI are based of @@ -110,20 +83,33 @@ class cami(): :param home_path: Path to the cami home directory (gitlab repository) :type home_path: str """ - self.debug = False + self.debug = debug self.ppi_graph = ppi_graph self.origin_ppi_graph = ppi_graph.copy() self.ppi_vertex2gene = self.ppi_graph.vertex_properties["name"] self.ppi_gene2vertex = {self.ppi_vertex2gene[vertex]: vertex for vertex in self.ppi_graph.vertices()} - self.initial_seed_lst = None + self.initial_seed_lst = initial_seed_lst self.seed_lst = seed_lst self.origin_seed_lst = seed_lst.copy() self.tool_wrappers = tool_wrappers - self.output_dir = output_dir + self.toolweights = toolweights self.home_path = home_path + if uid==None: + uid = str(uuid.uuid4()) self.uid = str(uid) - if tmp_dir == '': - tmp_dir = os.path.join(home_path, 'data', 'tmp', self.uid) + if output_dir == None: + output_dir = os.path.join(home_path, 'data', 'output', self.uid) + if not os.path.exists(output_dir): + os.makedirs(output_dir) + if self.debug: + print(f"Output directory of cami: {output_dir}") + self.output_dir = output_dir + + tmp_dir = os.path.join(home_path, 'data', 'tmp', self.uid) + if not os.path.exists(tmp_dir): + os.makedirs(tmp_dir) + if debug: + print(f'Creating unique temporary directory for CAMI: {tmp_dir}') self.tmp_dir = tmp_dir self.nof_tools = len(tool_wrappers) @@ -132,16 +118,19 @@ class cami(): self.cami_module = [] # TODO: pick place where cami_module is set, which consensus approach should we use? self.code2toolname = {tool.code: tool.name for tool in self.tool_wrappers} self.code2toolname[0] = 'No tool' - self.ncbi = False + self.ncbi = ncbi config = ConfigParser() - config.read('camiconf') + config.read(configuration) self.seed_score = config.get('cami', 'seed_score') - self.config = config + self.config = configuration self.threaded = parallelization # set weights for seed genes in ppi_graph for seed in self.seed_lst: self.ppi_graph.vertex_properties["cami_score"][seed] = self.seed_score + # initialize tools + for tool in tool_wrappers: + self.initialize_tool(tool) def reset_cami(self, new_uid='', change_tmp=False): if not new_uid == '': @@ -169,7 +158,11 @@ class cami(): tool.set_homepath(self.home_path) tool.set_id(self.uid) tool.set_config(self.config) - + if self.toolweights is not None: + tool.set_weight(self.toolweights[tool.code - 1]) + else: + tool.set_weight() + def initialize_all_tools(self): for tool in self.tool_wrappers: self.initialize_tool(tool) @@ -193,24 +186,50 @@ class cami(): return preds def make_evaluation(self): + seed_gene_lst = [self.ppi_vertex2gene[seed] for seed in self.seed_lst] + ppi_graph_file = os.path.join(self.tmp_dir, f'ppi_graph_{self.uid}.graphml') + self.ppi_graph.save(ppi_graph_file) + biodigest.setup.main(setup_type="api") for result_set in self.result_module_sets: - validation_results = biodigest.single_validation.single_validation( + set_validation_results = biodigest.single_validation.single_validation( tar=set(self.result_module_sets[result_set]), tar_id='entrez', mode='set-set', distance='jaccard', ref=set(self.seed_lst), ref_id='entrez') - if validation_results['status'] == 'ok': - biodigest.single_validation.save_results(validation_results, f'{result_set}_{self.uid}', + + if set_validation_results['status'] == 'ok': + biodigest.single_validation.save_results(set_validation_results, f'{result_set}_{self.uid}', self.output_dir) - biodigest.evaluation.d_utils.plotting_utils.create_plots(results=validation_results, + biodigest.evaluation.d_utils.plotting_utils.create_plots(results=set_validation_results, mode='set-set', tar=set(self.result_module_sets[result_set]), tar_id='entrez', out_dir=self.output_dir, - prefix=f'{result_set}_{self.uid}') + prefix=f'{result_set}_{self.uid}', + file_type='png') + + sub_validation_results = biodigest.single_validation.single_validation( + tar=set(self.result_module_sets[result_set]), + tar_id='entrez', + mode='subnetwork-set', + distance='jaccard', + network_data={"network_file":ppi_graph_file, + "prop_name":"name", + "id_type":"entrez"} + ) + if sub_validation_results['status'] == 'ok': + biodigest.single_validation.save_results(sub_validation_results, f'{result_set}_{self.uid}', + self.output_dir) + biodigest.evaluation.d_utils.plotting_utils.create_plots(results=sub_validation_results, + mode='subnetwork-set', + tar=set(self.result_module_sets[result_set]), + tar_id='entrez', + out_dir=self.output_dir, + prefix=f'{result_set}_{self.uid}', + file_type='png') def run_threaded_tool(self, tool, pred_sets): """run a tool in one thread and save the results into a dictionary pred_sets @@ -245,7 +264,7 @@ class cami(): for tool in self.tool_wrappers: pred_sets[tool] = self.run_tool(tool) - assert (list(pred_sets.values()).count(None) < 1) + assert (list(pred_sets.values()).count(None) < 1), "One of the tools did not return a result set" result_sets = {tool: set([self.ppi_graph.vertex(idx) for idx in pred_sets[tool]]) for tool in pred_sets} return result_sets @@ -304,6 +323,9 @@ class cami(): camis = { 'cami_v1': {'function': cami_v1.run_cami, 'params': {'consens_threshold': consens_threshold}}, + 'union': {'function': cami_v1.make_union, 'params': {}}, + 'intersection': {'function': cami_v1.make_intersection, 'params': {}}, + 'first_neighbors': {'function': cami_v1.make_first_neighbor_result_set, 'params': {}}, 'cami_v2_param1_tr': {'function': cami_v2.run_cami, 'params': { 'hub_penalty': 0.3, 'damping_factor': 0.7, 'confidence_level': 0.5 }}, @@ -390,6 +412,7 @@ class cami(): print( f'With the {len(seed_genes)} seed genes the module predicted by {toolname} contains {len(self.result_module_sets[toolname])} genes') + def generate_output(self, cami_method, seed_genes, cami_vlist, cami_vertices, putative_vertices, cami_genes, gene_name_map, codes2tools, cami_scores): # save all predictions by all tools @@ -426,50 +449,52 @@ class cami(): outputfile.write( f'{gene_name_map[vertex]}\t{str(vertex)}\t{cami_scores[vertex]}\t{vertex.out_degree()}{url}{summary}\n') - # # save the whole module - # whole_module = [] - # with open(f'{self.output_dir}/{cami_method}_module_{self.uid}.txt', 'w') as modfile: - # for vertex in seed_genes: - # modfile.write(f'{vertex}\n') - # whole_module.append(vertex) - # for vertex in cami_genes: - # modfile.write(f'{vertex}\n') - # whole_module.append(vertex) - - # print(f'saved {cami_method} output in: {cami_method}_output_{self.uid}.tsv') - # print(f'saved the Consensus Active Module by CAMI in: {self.output_dir}/{cami_method}_module_{self.uid}.txt') - # save predicted modules by all other tools for tool in self.result_module_sets: - with open(f'{self.output_dir}/{tool}_output_{self.uid}.tsv', 'w') as outputfile: + with open(f'{self.output_dir}/{tool}_output_module_{self.uid}.tsv', 'w') as outputfile: outputfile.write('gene\n') for gene in self.result_gene_sets[tool]: outputfile.write(f'{gene}\n') if self.debug: print(f'saved {tool} output in: {self.output_dir}/{tool}_output_{self.uid}.tsv') - - def use_nvenn(self): - """Create Venn Diagrams via a external tool named degradome. + + def visualize_and_save_comparison_matrix(self, additional_id='', + title='Intersections of result_gene_sets of all analyzed algorithms.'): + """Create a comparison matrix of the results of all tools. And save it as png file. + """ + identifier = f'{self.uid}_{additional_id}' if additional_id else self.uid + comp_matrix = comparison_matrix.make_comparison_matrix(self.result_gene_sets) + comp_fig, comp_ax, norm_fig, norm_ax = comparison_matrix.plot_comparison_matrix(comp_matrix, + title=title, + n_rows=self.nof_tools,) + comp_fig_file = f'{self.output_dir}/comparison_matrix_{identifier}.png' + comp_fig.savefig(comp_fig_file, bbox_inches="tight") + if self.debug: + print(f'saved comparison matrix in: {comp_fig_file}') + norm_fig_file = f'{self.output_dir}/normalized_comparison_matrix_{identifier}.png' + if self.debug: + print(f'saved normalized comparison matrix in: {norm_fig_file}') + norm_fig.savefig(norm_fig_file, bbox_inches="tight") + comp_fig.close() + norm_fig.close() + return comp_fig_file, norm_fig_file + + def use_nvenn(self, download=False): + """Create Venn Diagrams via a external tool named nvenn by degradome. Sends a request via requests to the degradome server. Returns the URL of the result. """ # visualize with degradome if self.nof_tools < 7: - print('Visualizing results using Degradome...') degradome_sets = {tool: self.result_gene_sets[tool] for tool in self.result_gene_sets if len(self.result_gene_sets[tool]) > 0} url = degradome.send_request(degradome_sets) with open(f'{self.output_dir}/venn_link_{self.uid}.txt', 'w') as f: f.write(url) + if download: + self.download_diagram(url) return url - - # elif nof_tools == 6: - # print('Visualizing using Degradome...(seeds excluded from results)') - # # degradome_sets = result_sets.copy() - # # degradome_sets['CAMI'] = set(result_genes) - # url = degradome.send_request(degradome_sets) - # webbrowser.open(url) else: print('Cannot use degradome to create venn diagrams of 6 or more tools') return None diff --git a/cami_src/camiconf b/cami_src/camiconf index cc6f0b0..9a695fa 100644 --- a/cami_src/camiconf +++ b/cami_src/camiconf @@ -12,20 +12,27 @@ visualization_flag = False output_name = 'modules.out' para = 1 c = 'false' +toolweight: 1 [diamond] + alpha : 1 pred_factor : 3 max_preds : 200 p_value_cutoff : 1e-05 +toolweight : 1 [robust] initial_fraction = 0.25 reduction_factor = 0.9 number_steiner_trees = 30 threshold = 0.1 +toolweight : 1 [cami] seed_score = 10.0 +toolweight : 1 +[custom_algorithm] +toolweight : 1 # vim: set fdm=marker: diff --git a/cami_src/consensus/cami_v1.py b/cami_src/consensus/cami_v1.py index b65e655..e76c39e 100644 --- a/cami_src/consensus/cami_v1.py +++ b/cami_src/consensus/cami_v1.py @@ -45,4 +45,40 @@ def run_cami(result_sets, ppi_graph, seed_lst, predicted_by, cami_scores, code2t codes2tools = {vertex: [code2toolname[idx] for idx, code in enumerate(predicted_by[vertex]) if code == 1] for vertex in ppi_graph.vertices()} - return cami_vertices, putative_vertices, codes2tools \ No newline at end of file + return cami_vertices, putative_vertices, codes2tools + +def make_union(result_sets, ppi_graph, seed_list, + predicted_by, cami_scores, + code2toolname, tool_code, params): + codes2tools = {vertex: [code2toolname[idx] for idx, code in enumerate(predicted_by[vertex]) if code == 1] for + vertex in ppi_graph.vertices()} + putative_vertices = set() + union = set() + for tool in result_sets: + for vertex in result_sets[tool]: + putative_vertices.add(vertex) + union.add(vertex) + return union, putative_vertices, codes2tools + +def make_intersection(result_sets, ppi_graph, seed_list, + predicted_by, cami_scores, + code2toolname, tool_code, params): + codes2tools = {vertex: [code2toolname[idx] for idx, code in enumerate(predicted_by[vertex]) if code == 1] for + vertex in ppi_graph.vertices()} + putative_vertices = set() + intersect = set(list(result_sets.values())[0]) + for tool in result_sets: + intersect = intersect.intersection(result_sets[tool]) + return intersect, putative_vertices, codes2tools + +def make_first_neighbor_result_set(result_sets, ppi_graph, seed_list, + predicted_by, cami_scores, + code2toolname, tool_code, params): + codes2tools = {vertex: [code2toolname[idx] for idx, code in enumerate(predicted_by[vertex]) if code == 1] for + vertex in ppi_graph.vertices()} + putative_vertices = set() + first_neighbor = set() + for seed in seed_list: + for neighbor in seed.all_neighbors(): + first_neighbor.add(neighbor) + return first_neighbor, putative_vertices, codes2tools \ No newline at end of file diff --git a/cami_src/evaluation/__init__.py b/cami_src/evaluation/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/cami_src/preprocess.py b/cami_src/preprocess.py index 9512aff..406bf9d 100644 --- a/cami_src/preprocess.py +++ b/cami_src/preprocess.py @@ -1,5 +1,115 @@ +import os +import uuid +import sys import graph_tool -import graph_tool.centrality +from configparser import ConfigParser +from algorithms.DiamondWrapper import DiamondWrapper +from algorithms.DominoWrapper import DominoWrapper +from algorithms.RobustWrapper import RobustWrapper +from algorithms.HHotNetWrapper import HHotNetWrapper +from algorithms.CustomWrapper import CustomWrapper + + +def all(ppi_file, seed_file, + config='camiconf', + cami_tools=['diamond', 'domino', 'robust'], + ext_results=[], + symbol_column=None, delimiter="\t"): + """executes all preprocessing steps, returns a ppi graph, a list of seeds in the network, the initial + list of seeds and a list of initialized tools + :param ppi_file: path to the ppi file + :type ppi_file: str + :param seed_file: path to the seed file + :type seed_file: str + :param tools: list of tools to be executed by CAMI, defaults to ['diamond', 'domino', 'robust'] + :type tools: list, optional + :param ext_results: list paths to external results, defaults to [] + :type ext_results: list, optional + :param symbol_column: column name of the symbol column, defaults to None + :type symbol_column: str, optional + :param delimiter: delimiter in input csv, defaults to "\t" + :type delimiter: str, optional + :return: Graph() object, list of seeds in network, initial list of seeds, list of initialized tools + :rtype: Graph(), list, list, list + """ + g = csv2graph(ppi_file, symbol_column, delimiter) + seeds = txt2lst(seed_file) + checked_seeds = update_seed_list(seeds, g) + cami_wrappers = initialize_cami_tools(cami_tools,config) + ext_wrappers = initialize_external_tools(ext_results,config,len(cami_tools)) + all_wrappers = cami_wrappers + ext_wrappers + return g, checked_seeds, seeds, all_wrappers + +def create_output_directories(output_dir, tmp_dir, identifier, path_to_data=None): + """creates the output directories for the results and the temporary files + + :param identifier: identifier for the output directory + :type identifier: str + :param output_dir: path to the output directory + :type output_dir: str + :param tmp_dir: path to the temporary directory + :type tmp_dir: str + """ + if identifier == None and output_dir == None and tmp_dir == None and path_to_data == None: + print('Please specify an output directory or an identifier and a path to where the results should be stored.') + if identifier == None: + identifier = str(uuid.uuid4()) + + if path_to_data == None: + path_to_data = os.path.join(os.path.dirname(os.path.abspath(__file__)), '../../data') + + if output_dir==None: + output_dir = os.path.join(path_to_data, f'{identifier}') + output_dir = os.path.abspath(output_dir) + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + if tmp_dir == None: + tmp_dir = os.path.join(path_to_data, f'tmp/{identifier}') + + if not os.path.exists(output_dir): + os.makedirs(output_dir) + return output_dir, tmp_dir + +def initialize_external_tools(ext_tool_files,config,n_cami_tools): + ext_wrappers = [] + if len(ext_tool_files) > 0: + for idx in range(len(ext_tool_files)): + dummy_wrapper = CustomWrapper(config) + dummy_wrapper.code = idx + 1 + n_cami_tools + dummy_wrapper.name = f'ext_tool_{idx}' + ext_wrappers.append(dummy_wrapper) + return ext_wrappers + +def initialize_cami_tools(tools, config='camiconf'): + """initializes the that should be executed by CAMI, returns a list of CAMI AlgorithmWrappers + + :param tools: list of tools + :type tools: list + :return: list of initialized tools + :rtype: list + """ + wrappers = {'diamond': DiamondWrapper(config), + 'domino': DominoWrapper(config), + 'robust': RobustWrapper(config), + 'hotnet': HHotNetWrapper(config) + } + tool_wrappers = [wrappers[tool] for tool in tools] + return tool_wrappers + +def update_seed_list(seeds, g): + """returns an updated version (copy) of the seed list to only contain genes that are in the graph + also renames the checked seeds to their index of the gene in the graph + :param seeds: list of seeds + :type seeds: list + :param g: graph_tool object Graph() + :type g: Graph() + :return: updated list of seeds + :rtype: list + """ + name2index = name2index_dict(g) + updated_seeds = [name2index[seed] for seed in seeds if seed in name2index] + return updated_seeds def csv2graph(inputfile, symbol_columns, @@ -15,7 +125,7 @@ def csv2graph(inputfile, """ print('Creating the PPI network graph and seed list...') g = graph_tool.load_graph_from_csv(inputfile, skip_first=True, - csv_options={'delimiter': '\t', 'quotechar': '"'}) + csv_options={'delimiter':delimiter, 'quotechar': '"'}) if symbol_columns is not None: if len(symbol_columns) == 0: @@ -78,18 +188,4 @@ def name2index_dict(g): """ index2name = g.vertex_properties["name"] return {index2name[v]:v for v in g.vertices()} - -# TODO: review this function, where is it used? -def compare_seeds_and_network(ppi_network, seeds): - seeds_in_network = [] - seeds_not_in_network = [] - for seed in seeds: - if len(seed) > 0: - seeds_in_network.append(seed) - else: - seeds_not_in_network.append(seed) - if len(seeds_not_in_network) > 0: - print(f'Warning: {len(seeds_not_in_network)} of {len(seeds)} seeds are not in the input PPI-network: {seeds_not_in_network}') - return seeds_in_network - return False diff --git a/cami_src/seed_variationconf b/cami_src/seed_variationconf index 665380c..8475a43 100644 --- a/cami_src/seed_variationconf +++ b/cami_src/seed_variationconf @@ -12,20 +12,23 @@ visualization_flag = False output_name = 'modules.out' para = 1 c = 'false' +toolweight : 1 [diamond] alpha : 1 pred_factor : 3 max_preds : 200 p_value_cutoff : 1e-05 +toolweight : 1 [robust] initial_fraction = 0.25 reduction_factor = 0.9 number_steiner_trees = 30 threshold = 0.1 +toolweight : 1 [cami] seed_score = 10.0 - +toolweight : 1 # vim: set fdm=marker: \ No newline at end of file -- GitLab