Skip to content
Snippets Groups Projects
Select Git revision
  • 62686b99a3e13aadf6f7f73baf13dea9c4e09412
  • main default protected
  • sumlab
  • dev/test_tobias
  • jack.rolph-main-patch-16563
  • jack.rolph-main-patch-96201
  • jack.rolph-main-patch-18340
  • jack.rolph-main-patch-15793
  • jack.rolph-main-patch-74592
  • 1.0.0
10 results

PeakOTron.py

Blame
  • Code owners
    Assign users and groups as approvers for specific file changes. Learn more.
    cami.py 25.08 KiB
    #!/usr/bin/env python3
    import os
    import sys
    import shutil
    from algorithms.DiamondWrapper import DiamondWrapper
    from algorithms.DominoWrapper import DominoWrapper
    from algorithms.RobustWrapper import RobustWrapper
    from algorithms.HHotNetWrapper import HHotNetWrapper
    from algorithms.AlgorithmWrapper import AlgorithmWrapper
    import preprocess
    import uuid
    import cami_suite
    import argparse
    import webbrowser
    import random
    import matplotlib.pyplot as plt
    import seaborn as sb
    import pandas as pd
    
    #MC:
    from configparser import ConfigParser
    
    
    def main(ppi_network, seeds, tools, tool_weights, consensus, evaluate,
             output_dir, identifier, save_temps, nvenn, save_image, force, drugstone, ncbi, configuration, 
             seed_variation, parallelization, external_results, debug):
        print('CAMI started')
        config = ConfigParser()
        config.read(configuration)
        seed_score = config.get('cami', 'seed_score')
        nof_tools = len(tools) + len(external_results)
        
        wrappers = {'diamond': DiamondWrapper(),
                    'domino': DominoWrapper(),
                    'robust': RobustWrapper(),
                    'hotnet': HHotNetWrapper()
                    }
    
        tool_wrappers = [wrappers[tool] for tool in tools]
        nof_cami_tools = len(wrappers)
        
        if tool_weights == None:
            tool_weights = nof_tools * [1]
        
        ext_wrappers = []
        if len(external_results) > 0:
            for idx in range(len(external_results)):
                dummy_wrapper = AlgorithmWrapper()
                dummy_wrapper.code = idx + 1 + nof_cami_tools
                ext_wrappers.append(dummy_wrapper)
            
        ppi_graph = preprocess.csv2graph(ppi_network, symbol_columns=drugstone, nof_tools=nof_tools)
    
        seed_lst = preprocess.txt2lst(seeds)
        initial_seedlst = seed_lst.copy()
    
        # rename seeds with their corresponding indices in the ppi_graph
        name2index = preprocess.name2index_dict(ppi_graph)
        checked_seed_lst = [name2index[v] for v in seed_lst if v in name2index]
        if debug:
            print('Created the PPI-network Graph and seed list.')
        n = len(checked_seed_lst)
        m = len(seed_lst)
        if n != m  and not force:
            name = ppi_graph.vertex_properties["name"]
            print(f'There are {abs(m-n)} seeds missing in the PPI network: ({set([name[cseed] for cseed in checked_seed_lst]).symmetric_difference(set(initial_seedlst))})')
            choice = input('Continue? [y/n]')
            if choice == 'y':
                pass
            else:
                print('Aborted because of missing seeds. Please try again.')
                exit(1)
    
        seed_lst = checked_seed_lst.copy()
        if debug: print(f'Continuing with vertices at indices {[int(seed) for seed in seed_lst]} as input seeds')
    
        # change directory to ~/cami/cami (home of cami.py)
        cami_home = sys.argv[0].rsplit('/', 1)
        os.chdir(cami_home[0])
        home_path = os.path.dirname(os.getcwd())
        if debug:
            print(f"Home directory of cami: {home_path}")
        if identifier==None:
            identifier = str(uuid.uuid4())
    
        if output_dir==None:
            output_dir = os.path.join(home_path, f'data/output/{identifier}')
            output_dir = os.path.abspath(output_dir)
            if debug:
                print(f"Output directory of cami: {output_dir}")
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
    
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
    
        #create temporary directory
        tmp_dir = os.path.join(home_path, f'data/tmp/{identifier}')
        if debug:
            print(f'Creating unique temporary directory for CAMI: {tmp_dir}')
    
        if not os.path.exists(tmp_dir):
            os.makedirs(tmp_dir)
            
        elif not force:
            print(f'TemporaryDirectory {tmp_dir} already exists.')
            choice = input('overwrite? [y/n]')
            if choice == 'y':
                pass
                # shutil.rmtree(tmp_dir)
                # os.makedirs(tmp_dir)
            else:
                print('Abort. Please try again.')
                exit(1)
    
        # initialize CAMI
        cami = cami_suite.cami(ppi_graph,seed_lst,tool_wrappers,output_dir,identifier,home_path,
                               tmp_dir=tmp_dir, config=configuration, seed_score=seed_score, 
                               parallelization=parallelization)
    
        if ncbi:
            cami.ncbi = True
        
        if debug:
            cami.debug = True
            
        for tool in tool_wrappers:
            cami.initialize_tool(tool)
        
        for tool in ext_wrappers:
            cami.initialize_tool(tool)
        
        for idx, tool in enumerate(tool_wrappers):
            tool.set_weight(float(tool_weights[idx]))
    
        cami.set_initial_seed_lst(initial_seedlst)
    
        if (consensus or (not consensus and not evaluate and not seed_variation)) and not seed_variation:
            result_sets = cami.make_predictions()
            if len(external_results) > 0:
                assert(len(ext_wrappers) == len(external_results))
                external_input = {}
                for i in range(len(ext_wrappers)):
                    external_input[ext_wrappers[i]] = external_results[i]
                result_sets = cami.take_custom_results(external_input, result_sets)
            cami.create_consensus(result_sets)
    
            # adds the seeds to the results, right now result_sets contains the seeds that should be ADDED to the module
            # for result in result_sets.keys():
            #     result_sets[result] = result_sets[result].union(cami.seed_lst)
    
            if nvenn or save_image:
                print('Sending results to nVenn')
                url = cami.use_nvenn()
                if url:
                    if nvenn:
                        webbrowser.open(url)
                    if save_image:
                        cami.download_diagram(url)
    
            if drugstone is not None:
                print('Sending results to DrugstOne')
                cami.use_drugstone()
            if consensus:
                cami.reset_cami()
    
        if evaluate or (not consensus and not evaluate and not seed_variation):
            cami.make_evaluation()
            if evaluate:
                cami.reset_cami()
        
        elif (not consensus and not evaluate):
            cami.reset_cami()
            
        # SEED VARIATION
        if seed_variation:
            
            def predict_and_make_consensus(vis=False):
                result_sets = cami.make_predictions()
                cami.create_consensus(result_sets)
                if vis:
                    n_results = len(cami.result_gene_sets)
                    comparison_matrix = pd.DataFrame([[int(0) for _ in range(n_results)] for __ in range(n_results)],
                                                     columns = list(cami.result_gene_sets.keys()),
                                                     index = list(cami.result_gene_sets.keys()),
                                                     dtype=int)
                    for algo1 in cami.result_gene_sets:
                        for algo2 in cami.result_gene_sets:
                            comparison_matrix.loc[algo1,algo2] = int(len(cami.result_gene_sets[algo1].intersection(cami.result_gene_sets[algo2])))
                            
                    fig2, ax3 = plt.subplots(figsize=(20,20))      
                                  
                    ax3 = sb.heatmap(comparison_matrix, annot=True, fmt='g')
                    ax3.set_title('Intersections of result_gene_sets of all analyzed algorithms.')
                    fig2.savefig(f'{output_dir}/heatmap_{cami.uid}.png', bbox_inches="tight")
                    print(f'saved intersection heatmap of all algorithms under: {output_dir}/heatmap_{cami.uid}.png')
                    plt.close(fig2)
                    
                    fig2a, ax3a = plt.subplots(figsize=(20,20))
                    comparison_matrix_normalized = comparison_matrix.apply(lambda row: row/row.max(), axis=1)
                    ax3a = sb.heatmap(comparison_matrix_normalized, annot=True, fmt='.2f')
                    ax3a.set_title('Normalized intersections of result_gene_sets of all analyzed algorithms.')
                    fig2a.savefig(f'{output_dir}/heatmap_{cami.uid}_normalized.png', bbox_inches="tight")
                    print(f'saved intersection heatmap of all algorithms under: {output_dir}/heatmap_{cami.uid}_normalized.png')
                    plt.close(fig2a)
                if nvenn and vis:
                    url = cami.use_nvenn()
                    cami.download_diagram(url)
                    
            with open(f'{output_dir}/00_node_degrees.tsv', 'w') as node_degrees:
                node_degrees.write('vertex\tout_degree\tin_degree\n')
                for vertex in cami.ppi_graph.vertices():
                    node_degrees.write(f'{cami.ppi_vertex2gene[vertex]}\t{vertex.out_degree()}\t{vertex.in_degree()}\n')   
            
            # initialize cami and seed_var
            base_seeds = cami.origin_seed_lst
            original_seeds = [cami.ppi_vertex2gene[seed] for seed in base_seeds]
            print(f'Initializing CAMI and the seed variation by running CAMI with all given seeds:{original_seeds}')
            
            with open(os.path.join(home_path, f'data/output/explorativeness.tsv'), 'a') as f:
                predict_and_make_consensus(vis=True)
                seedname = seeds
                for tool in cami.result_gene_sets:
                    f.write(f'\n{seedname}\t{len(cami.seed_lst)}\t{tool}\t{len(cami.result_gene_sets[tool])}')
            cami.make_evaluation()
            #predict_and_make_consensus(vis=True)
            
            random.seed(50)
            removal_frac = 0.2
            nof_iterations = int(seed_variation)
            used_tools = list(cami.result_gene_sets.keys())
            nof_seeds = len(base_seeds)
            nof_removals = max([int(nof_seeds * removal_frac), 1])
            variation_results = []
            result_file_1 = f'{output_dir}/00_seedvariation_results1.tsv'
            result_file_2 = f'{output_dir}/00_seedvariation_results2.tsv'
            result_file_3 = f'{output_dir}/00_seedvariation_results3.tsv'
            result_file_4 = f'{output_dir}/00_seedvariation_results4.tsv'
            n_results = len(cami.result_gene_sets)
            
            redisc_intersection_matrix = pd.DataFrame([[0 for _ in range(n_results)] for __ in range(n_results)],
                                                        columns = list(cami.result_gene_sets.keys()),
                                                        index = list(cami.result_gene_sets.keys()),
                                                        dtype=int)
            with open(result_file_1, 'w') as res_table1:
                with open(result_file_2, 'w') as res_table2:
                    with open(result_file_3, 'w') as res_table3:
                        with open(result_file_4, 'w') as res_table4:
                            res_table1.write('id')
                            res_table2.write('id\tnof_removed_seeds\tremoved_seeds\t{used_seeds}\ttool\trediscovery_prevalence')
                            res_table2.write('\trediscovery_rate\trediscovered_seeds')
                            for tool in used_tools:
                                res_table1.write(f'\t{tool}')                    
                            res_table1.write('\n')
                            res_table2.write('\n')
                            res_table3.write('id\ttrue_positives\trediscoveries/module_size\n')
                            tp_rate_dict = {k:list() for k in used_tools}
                            module_size_dict = {k:list() for k in used_tools}
                            
                            for ident in range(nof_iterations):
                                res_table1.write(f'{ident}')
                                # update uid
                                new_identifier = identifier + f'_{ident}'
                                # reset cami
                                cami.reset_cami(new_uid=new_identifier)
    #                            cami.ppi_graph = original_ppi
                                
                                #remove seeds (again)
                                print(f'Removing {nof_removals} seeds from the original seed list...')
                                removed_seeds_idx = random.sample(list(range(nof_seeds)), nof_removals)
                                removed_seeds = cami.remove_seeds(removed_seeds_idx)
                                rem_seeds = [cami.ppi_vertex2gene[seed] for seed in removed_seeds]
                                print(f'Removed: {rem_seeds} from the seed list')
                                print('Updating tools and repeat CAMI')
                                # reinitialize tools
                                cami.initialize_all_tools()
                                
                                # repeat consensus
                                if ident%20==0:     
                                    predict_and_make_consensus(vis=True)
                                else:
                                    predict_and_make_consensus()
                                
                                used_seeds = [cami.ppi_vertex2gene[seed] for seed in cami.seed_lst]
                                
                                
                                result_dict = cami.result_gene_sets
                                redisc_rate_dict = {}
                                redisc_seeds_dict = {}
                                
                                for tool in result_dict:
                                    nof_predictions = len(result_dict[tool]) + len(used_seeds)
                                    redisc_seeds = set(result_dict[tool]).intersection(set(rem_seeds))
                                    redisc_prev = len(redisc_seeds)
                                    redisc_rate = redisc_prev / nof_removals
                                    redisc_rate_dict[tool] = redisc_rate #true positive rate: rate verhältnis von gefundenen und allgemein gefundenen
                                    redisc_seeds_dict[tool] = redisc_seeds
                                    tp_rate = redisc_prev / len(removed_seeds)
                                    tp_rate_dict[tool].append(tp_rate)
                                    module_size_frac = redisc_prev / nof_predictions
                                    assert module_size_frac <= 1
                                    module_size_dict[tool].append(module_size_frac)
                                    res_table3.write(f'{ident}\t{tool}\t{tp_rate}\t{module_size_frac}\n')
                                    res_table1.write('\t')
                                    for idx,seed in enumerate(redisc_seeds):
                                        if idx == 0:
                                            res_table1.write(f'{list(redisc_seeds)[0]}')
                                        else:
                                            res_table1.write(f',{seed}') 
                                    print(f'{tool} rediscovered {redisc_seeds} after removing {rem_seeds}.')
                                    res_table2.write(f'{ident}\t{tool}\t{nof_removals},\t{rem_seeds}\t{used_seeds}\t{redisc_prev}\t{redisc_rate}\t{redisc_seeds}\n')
                                res_table1.write('\n')
                                variation_results.append((redisc_rate_dict, redisc_seeds_dict, used_seeds, rem_seeds))
                                for algo1 in redisc_seeds_dict:
                                    for algo2 in redisc_rate_dict:
                                        redisc_intersection_matrix.loc[algo1,algo2] += len(redisc_seeds_dict[algo1].intersection(redisc_seeds_dict[algo2]))
            print(f'Result tables are saved in the following locations:')
            print(f'Rediscovered seeds: {result_file_1}')
            print(f'Rediscovery Rates: {result_file_2}')
            print(f'Sensitivity: {result_file_3}')
            fig3, ax6 = plt.subplots(figsize=(20,20))                    
            ax6 = sb.heatmap(redisc_intersection_matrix, annot=True, fmt='g')
            ax6.set_title('Number of times the algorithms rediscovered the same seeds')
            fig3.savefig(f'{output_dir}/00_seedvariation_heatmap_{identifier}.png', bbox_inches="tight")
            print(f'saved intersection heatmap of all algorithms under:{output_dir}/heatmap_{identifier}.png')
            plt.close(fig3)
            fig3a, ax6a = plt.subplots(figsize=(20,20))
            redisc_intersection_matrix_normalized = redisc_intersection_matrix.apply(lambda row: row/row.max(), axis=1)
            ax6a = sb.heatmap(redisc_intersection_matrix_normalized, annot=True, fmt='.2f')
            ax6a.set_title('Normalized numbers of times the algorithms rediscovered the same seeds')
            fig3a.savefig(f'{output_dir}/00_seedvariation_heatmap_{identifier}_normalized.png', bbox_inches="tight")
            print(f'saved intersection heatmap of all algorithms under:{output_dir}/heatmap_normalized_{cami.uid}.png')
            plt.close(fig3a)
            
    #        print(variation_results)
            rediscovery_rates_results = [results[0] for results in variation_results]
    #        print(rediscovery_rates_results)
            tools = [tool for tool in rediscovery_rates_results[0].keys()]
            tool_labels = tools.copy()
            redisc_rates = [[res[tool] for res in rediscovery_rates_results] for tool in tools]
            for idx,tool in enumerate(tools):
                if '_' in tool:
                    # find the index of the second occurrence of the character
                    second_occurrence_index = tool.find('_', tool.find('_') + 1)
                    if second_occurrence_index > -1:
                        # replace the character at that index with the replacement character
                        tool_name = tool[:second_occurrence_index] + '\n' + tool[second_occurrence_index + 1:]
                        tool_labels[idx] = tool_name        
    
            #PLOT
            # Create a figure instance
            #print(sys.getrecursionlimit())
            fig1, (ax1, ax5, ax4) = plt.subplots(3, 1, figsize=(20,20))
            fig1.subplots_adjust(left=0.2)
            # Extract Figure and Axes instance
    
            # Create a plot
            violins1 = ax1.violinplot(redisc_rates, showmeans=True, showextrema=True)
             
            for violinpart in list(violins1.keys())[2:]:
                violins1[violinpart].set_color('k')
                
            for violin in violins1['bodies']:
                violin.set_facecolor('red')
            # Add title
            ax1.set_title(f'Rediscovery rate after randomly removing {nof_removals} seeds {nof_iterations} times from {identifier} seeds.', wrap=True, fontsize=14)
    
            ax1.set_xticks(list(range(1,len(tools)+1)))
            ax1.set_xticklabels(tool_labels)
            ax1.tick_params(axis='x', labelsize=11)
    
            ax1.set_ylabel('Rediscovery rate (<rediscovered seeds>/<removed seeds>)', wrap=True, fontsize=14)
    
            
            violins2 = ax4.violinplot([tp_rate_dict[tool] for tool in tools], showmeans=True, showextrema=True)
            for violinpart in list(violins2.keys())[2:]:
                violins2[violinpart].set_color('k')
            for violin in violins2['bodies']:
                violin.set_facecolor('orange')
            # Add title
            ax4.set_title(f'True positive rates after randomly removing {nof_removals} seeds {nof_iterations} times from {identifier} seeds.', wrap=True, fontsize=14)
    
            ax4.set_xticks(list(range(1,len(tools)+1)))
            ax4.set_xticklabels(tool_labels)
            ax4.tick_params(axis='x', labelsize=11)
    
            ax4.set_ylabel('Sensitivity (TP/TP + FN)', wrap=True, fontsize=14)
    
            violins3 = ax5.violinplot([module_size_dict[tool] for tool in tools], showmeans=True, showextrema=True)
            # Add title
            for violinpart in list(violins3.keys())[2:]:
                violins3[violinpart].set_color('k')
                
            for violin in violins3['bodies']:
                violin.set_facecolor('blue')
            ax5.set_title(f'Ratio of number of rediscovered seeds and predicted module size after removing {nof_removals} seeds {nof_iterations} times from {identifier} seeds.', wrap=True, fontsize=14)
    
            ax5.set_xticks(list(range(1,len(tools)+1)))
            ax5.set_xticklabels(tool_labels)
            
            ax5.set_ylabel('precision (<rediscovered seeds>/<module size>)', fontsize=14)
            ax5.tick_params(axis='x', labelsize=11)
            fig1.tight_layout()
            fig1.savefig(f'{output_dir}/00_{identifier}_seed_variation_result.png', bbox_inches="tight")
            plt.close(fig1)
            print(f'Violin plot saved under: 00_{identifier}_seed_variation_result.png')
            
        if save_temps:
            print(f'All temporary files were kept in {tmp_dir}')
            print(f'You can remove them by typing: rm -rv {tmp_dir}')
            print('Please note that CAMI internally works with the indices of the' +
                   ' vertices in the PPI-Network. This means that the names in the'+
                   ' inputfiles of the tools do not correspond to the gene names ' +
                   'but to their indices in the graph_tool Graph that CAMI creates.'
                   )
        else:
            print(f'Removing all temporary files in {tmp_dir}...')
            shutil.rmtree(tmp_dir)
            print('Done.')
    
    
    if __name__ == "__main__":
        parser = argparse.ArgumentParser(description="CAMI has two functions:\n" +\
            "Consensus Prediction:\nUse different algorithms to find " +\
            "active disease modules in a given PPI-network and combines their results. "+\
            "Uses a protein-protein-interaction-network (PPI-network) and a seed list as input\n"+\
            "Evaluation:\nEvaluate different tools with respect to the consensus of multiple tools."+\
            "If no mode is specified both functions are excecuted.")
        parser.add_argument('-n', '--ppi_network', required=True, action='store',
                help="Path to a tsv file containing the edges of the base PPI in the first two columns")
        parser.add_argument('-s','--seeds', required=True, action='store',
                help="Path to a txt file containing the seeds delimitered by breakline characters")
        parser.add_argument('-t', '--tools', nargs='+', action='store', default=["diamond", "domino", "robust"],
                help="List of tools that the user wants to use for prediction.\n Available tools are:\n" + \
                    "domino\ndiamond\nrobust\hotnet\nThe default tools are: diamond, domino and robust.")
        parser.add_argument('-ext', '--external_results', nargs='+', action='store', default=[],
                help="List of paths to results generated by external tools. The files should contain the tool name in the first line and then the predicted genes (1 gene per line)")
        parser.add_argument('-w', '--tool_weights', nargs='+', action='store', default=None,
                help="List of weights for the tools. If you have [domino, diamond, robust] as list of tools and diamonds weight should be twice as high as the other tools type: 1 2 1")
        parser.add_argument('-c', '--consensus', action='store_true', help="run only the consensus prediction part of cami")
        parser.add_argument('-var', '--seed_variation', action='store', help="repeat consensus selection multiple times (please provide the number of iterations) while removing 20 percent of the seeds.")
        parser.add_argument('-e', '--evaluate', action='store_true', help="run only the evaluation part of cami")
        parser.add_argument('-o', '--output_dir', action='store', help="path to output directory")
        parser.add_argument('-id', '--identifier', action='store', help="ID for the current excecution of cami. Defaults to a randomly generated ID")
        parser.add_argument('-tmp', '--save_temps', action='store_true', help="keep all temporary files")
        parser.add_argument('-v', '--nvenn', action='store_true', help="Visualize results using nVenn by Degradome, an external webtool. Please note that degradome can only be used for visualization with up to 5 tools.")
        parser.add_argument('-img', '--save_image', action='store_true', help="Save the venn diagram from the visualization as png. (Only possible for up to 5 tools)")
        parser.add_argument('-f', '--force', action='store_true', help="Ignore warnings and overwrite everything when excecuting CAMI.")
        parser.add_argument('-d', '--drugstone', nargs='*', action='store', default=None,
                            help="Visualize the cami module via the drugstone API. If necessary the user needs to provide a list of the two titles of the two columns that contain the symbols of the gene edges in the inputfile of the PPI network. The order needs to correspond to the order of the first two columns. The default is 'Official_Symbol_Interactor_A Official_Symbol_Interactor_B'. Please note that the symbol columns cannot be the first two columns. If the two main edges in the first two columns are correspond also the gene symbols please duplicate these columns.")
        parser.add_argument('-ncbi', '--ncbi', action='store_true',
                            help="Save the NCBI URLs and Summaries of the genes in the CAMI output.")
        parser.add_argument('-conf', '--configuration', nargs='*', action='store', default='camiconf',
                            help="Choose a configuration for the static variables.")
        parser.add_argument('-p', '--parallelization', action='store_true', 
                help="run the tools for prediction parallelized")
        parser.add_argument('-db', '--debug', action='store_true', 
                help="run CAMI with verbose outputs")
        #TODO List with additional arguments if needed by certain tools
    
        args = vars(parser.parse_args())
        if not os.path.exists(args['ppi_network']):
            raise RuntimeError('File does not exist: ' + args['ppi_network'])
    
        if not os.path.exists(args['seeds']):
            raise RuntimeError('File does not exist: ' + args['seeds'])
    
    # add name for implemented tools here:
        implemented_tools = [
                             "domino",
                             "diamond",
                             "robust",
                             "hotnet"
                             ]
        if args['tools']:
            for tool in args['tools']:
                if tool not in implemented_tools:
                    raise RuntimeError(tool + f' not implemented yet. Implemented tools are: {implemented_tools}')
    
        main(**args)