cami.py

#!/usr/bin/env python3
import os
import sys
import shutil
from algorithms.DiamondWrapper import DiamondWrapper
from algorithms.DominoWrapper import DominoWrapper
from algorithms.RobustWrapper import RobustWrapper
from algorithms.HHotNetWrapper import HHotNetWrapper
from algorithms.AlgorithmWrapper import AlgorithmWrapper
import preprocess
import uuid
import cami_suite
import argparse
import webbrowser
import random
import matplotlib.pyplot as plt
import seaborn as sb
import pandas as pd

#MC:
from configparser import ConfigParser


def main(ppi_network, seeds, tools, tool_weights, consensus, evaluate,
         output_dir, identifier, save_temps, nvenn, save_image, force, drugstone, ncbi, configuration,
         seed_variation, parallelization, external_results, debug):
    print('CAMI started')
    config = ConfigParser()
    config.read(configuration)
    seed_score = config.get('cami', 'seed_score')
    nof_tools = len(tools) + len(external_results)

    wrappers = {'diamond': DiamondWrapper(),
                'domino': DominoWrapper(),
                'robust': RobustWrapper(),
                'hotnet': HHotNetWrapper()
                }

    tool_wrappers = [wrappers[tool] for tool in tools]
    nof_cami_tools = len(wrappers)

    if tool_weights == None:
        tool_weights = nof_tools * [1]

    ext_wrappers = []
    if len(external_results) > 0:
        for idx in range(len(external_results)):
            dummy_wrapper = AlgorithmWrapper()
            dummy_wrapper.code = idx + 1 + nof_cami_tools
            ext_wrappers.append(dummy_wrapper)

    ppi_graph = preprocess.csv2graph(ppi_network, symbol_columns=drugstone, nof_tools=nof_tools)

    seed_lst = preprocess.txt2lst(seeds)
    initial_seedlst = seed_lst.copy()

    # rename seeds with their corresponding indices in the ppi_graph
    name2index = preprocess.name2index_dict(ppi_graph)
    checked_seed_lst = [name2index[v] for v in seed_lst if v in name2index]
    if debug:
        print('Created the PPI-network Graph and seed list.')
    n = len(checked_seed_lst)
    m = len(seed_lst)
    if n != m  and not force:
        name = ppi_graph.vertex_properties["name"]
        print(f'There are {abs(m-n)} seeds missing in the PPI network: ({set([name[cseed] for cseed in checked_seed_lst]).symmetric_difference(set(initial_seedlst))})')
        choice = input('Continue? [y/n]')
        if choice == 'y':
            pass
        else:
            print('Aborted because of missing seeds. Please try again.')
            exit(1)

    seed_lst = checked_seed_lst.copy()
    if debug: print(f'Continuing with vertices at indices {[int(seed) for seed in seed_lst]} as input seeds')

    # change directory to ~/cami/cami (home of cami.py)
    cami_home = sys.argv[0].rsplit('/', 1)
    os.chdir(cami_home[0])
    home_path = os.path.dirname(os.getcwd())
    if debug:
        print(f"Home directory of cami: {home_path}")
    if identifier==None:
        identifier = str(uuid.uuid4())

    if output_dir==None:
        output_dir = os.path.join(home_path, f'data/output/{identifier}')
        output_dir = os.path.abspath(output_dir)
        if debug:
            print(f"Output directory of cami: {output_dir}")
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    #create temporary directory
    tmp_dir = os.path.join(home_path, f'data/tmp/{identifier}')
    if debug:
        print(f'Creating unique temporary directory for CAMI: {tmp_dir}')

    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)

    elif not force:
        print(f'TemporaryDirectory {tmp_dir} already exists.')
        choice = input('overwrite? [y/n]')
        if choice == 'y':
            pass
            # shutil.rmtree(tmp_dir)
            # os.makedirs(tmp_dir)
        else:
            print('Abort. Please try again.')
            exit(1)

    # initialize CAMI
    cami = cami_suite.cami(ppi_graph,seed_lst,tool_wrappers,output_dir,identifier,home_path,
                           tmp_dir=tmp_dir, config=configuration, seed_score=seed_score,
                           parallelization=parallelization)

    if ncbi:
        cami.ncbi = True

    if debug:
        cami.debug = True

    for tool in tool_wrappers:
        cami.initialize_tool(tool)

    for tool in ext_wrappers:
        cami.initialize_tool(tool)

    for idx, tool in enumerate(tool_wrappers):
        tool.set_weight(float(tool_weights[idx]))

    cami.set_initial_seed_lst(initial_seedlst)

    if (consensus or (not consensus and not evaluate and not seed_variation)) and not seed_variation:
        result_sets = cami.make_predictions()
        if len(external_results) > 0:
            assert(len(ext_wrappers) == len(external_results))
            external_input = {}
            for i in range(len(ext_wrappers)):
                external_input[ext_wrappers[i]] = external_results[i]
            result_sets = cami.take_custom_results(external_input, result_sets)
        cami.create_consensus(result_sets)

        # adds the seeds to the results, right now result_sets contains the seeds that should be ADDED to the module
        # for result in result_sets.keys():
        #     result_sets[result] = result_sets[result].union(cami.seed_lst)

        if nvenn or save_image:
            print('Sending results to nVenn')
            url = cami.use_nvenn()
            if url:
                if nvenn:
                    webbrowser.open(url)
                if save_image:
                    cami.download_diagram(url)

        if drugstone is not None:
            print('Sending results to DrugstOne')
            cami.use_drugstone()
        if consensus:
            cami.reset_cami()

    if evaluate or (not consensus and not evaluate and not seed_variation):
        cami.make_evaluation()
        if evaluate:
            cami.reset_cami()

    elif (not consensus and not evaluate):
        cami.reset_cami()

    # SEED VARIATION
    if seed_variation:

        def predict_and_make_consensus(vis=False):
            result_sets = cami.make_predictions()
            cami.create_consensus(result_sets)
            if vis:
                n_results = len(cami.result_gene_sets)
                comparison_matrix = pd.DataFrame([[int(0) for _ in range(n_results)] for __ in range(n_results)],
                                                 columns = list(cami.result_gene_sets.keys()),
                                                 index = list(cami.result_gene_sets.keys()),
                                                 dtype=int)
                for algo1 in cami.result_gene_sets:
                    for algo2 in cami.result_gene_sets:
                        comparison_matrix.loc[algo1,algo2] = int(len(cami.result_gene_sets[algo1].intersection(cami.result_gene_sets[algo2])))

                fig2, ax3 = plt.subplots(figsize=(20,20))

                ax3 = sb.heatmap(comparison_matrix, annot=True, fmt='g')
                ax3.set_title('Intersections of result_gene_sets of all analyzed algorithms.')
                fig2.savefig(f'{output_dir}/heatmap_{cami.uid}.png', bbox_inches="tight")
                print(f'saved intersection heatmap of all algorithms under: {output_dir}/heatmap_{cami.uid}.png')
                plt.close(fig2)

                fig2a, ax3a = plt.subplots(figsize=(20,20))
                comparison_matrix_normalized = comparison_matrix.apply(lambda row: row/row.max(), axis=1)
                ax3a = sb.heatmap(comparison_matrix_normalized, annot=True, fmt='.2f')
                ax3a.set_title('Normalized intersections of result_gene_sets of all analyzed algorithms.')
                fig2a.savefig(f'{output_dir}/heatmap_{cami.uid}_normalized.png', bbox_inches="tight")
                print(f'saved intersection heatmap of all algorithms under: {output_dir}/heatmap_{cami.uid}_normalized.png')
                plt.close(fig2a)
            if nvenn and vis:
                url = cami.use_nvenn()
                cami.download_diagram(url)

        with open(f'{output_dir}/00_node_degrees.tsv', 'w') as node_degrees:
            node_degrees.write('vertex\tout_degree\tin_degree\n')
            for vertex in cami.ppi_graph.vertices():
                node_degrees.write(f'{cami.ppi_vertex2gene[vertex]}\t{vertex.out_degree()}\t{vertex.in_degree()}\n')

        # initialize cami and seed_var
        base_seeds = cami.origin_seed_lst
        original_seeds = [cami.ppi_vertex2gene[seed] for seed in base_seeds]
        print(f'Initializing CAMI and the seed variation by running CAMI with all given seeds:{original_seeds}')

        with open(os.path.join(home_path, f'data/output/explorativeness.tsv'), 'a') as f:
            predict_and_make_consensus(vis=True)
            seedname = seeds
            for tool in cami.result_gene_sets:
                f.write(f'\n{seedname}\t{len(cami.seed_lst)}\t{tool}\t{len(cami.result_gene_sets[tool])}')
        cami.make_evaluation()
        #predict_and_make_consensus(vis=True)

        random.seed(50)
        removal_frac = 0.2
        nof_iterations = int(seed_variation)
        used_tools = list(cami.result_gene_sets.keys())
        nof_seeds = len(base_seeds)
        nof_removals = max([int(nof_seeds * removal_frac), 1])
        variation_results = []
        result_file_1 = f'{output_dir}/00_seedvariation_results1.tsv'
        result_file_2 = f'{output_dir}/00_seedvariation_results2.tsv'
        result_file_3 = f'{output_dir}/00_seedvariation_results3.tsv'
        result_file_4 = f'{output_dir}/00_seedvariation_results4.tsv'
        n_results = len(cami.result_gene_sets)

        redisc_intersection_matrix = pd.DataFrame([[0 for _ in range(n_results)] for __ in range(n_results)],
                                                    columns = list(cami.result_gene_sets.keys()),
                                                    index = list(cami.result_gene_sets.keys()),
                                                    dtype=int)
        with open(result_file_1, 'w') as res_table1:
            with open(result_file_2, 'w') as res_table2:
                with open(result_file_3, 'w') as res_table3:
                    with open(result_file_4, 'w') as res_table4:
                        res_table1.write('id')
                        res_table2.write('id\tnof_removed_seeds\tremoved_seeds\t{used_seeds}\ttool\trediscovery_prevalence')
                        res_table2.write('\trediscovery_rate\trediscovered_seeds')
                        for tool in used_tools:
                            res_table1.write(f'\t{tool}')
                        res_table1.write('\n')
                        res_table2.write('\n')
                        res_table3.write('id\ttrue_positives\trediscoveries/module_size\n')
                        tp_rate_dict = {k:list() for k in used_tools}
                        module_size_dict = {k:list() for k in used_tools}

                        for ident in range(nof_iterations):
                            res_table1.write(f'{ident}')
                            # update uid
                            new_identifier = identifier + f'_{ident}'
                            # reset cami
                            cami.reset_cami(new_uid=new_identifier)
#                            cami.ppi_graph = original_ppi

                            #remove seeds (again)
                            print(f'Removing {nof_removals} seeds from the original seed list...')
                            removed_seeds_idx = random.sample(list(range(nof_seeds)), nof_removals)
                            removed_seeds = cami.remove_seeds(removed_seeds_idx)
                            rem_seeds = [cami.ppi_vertex2gene[seed] for seed in removed_seeds]
                            print(f'Removed: {rem_seeds} from the seed list')
                            print('Updating tools and repeat CAMI')
                            # reinitialize tools
                            cami.initialize_all_tools()

                            # repeat consensus
                            if ident%20==0:
                                predict_and_make_consensus(vis=True)
                            else:
                                predict_and_make_consensus()

                            used_seeds = [cami.ppi_vertex2gene[seed] for seed in cami.seed_lst]


                            result_dict = cami.result_gene_sets
                            redisc_rate_dict = {}
                            redisc_seeds_dict = {}

                            for tool in result_dict:
                                nof_predictions = len(result_dict[tool]) + len(used_seeds)
                                redisc_seeds = set(result_dict[tool]).intersection(set(rem_seeds))
                                redisc_prev = len(redisc_seeds)
                                redisc_rate = redisc_prev / nof_removals
                                redisc_rate_dict[tool] = redisc_rate #true positive rate: rate verhältnis von gefundenen und allgemein gefundenen
                                redisc_seeds_dict[tool] = redisc_seeds
                                tp_rate = redisc_prev / len(removed_seeds)
                                tp_rate_dict[tool].append(tp_rate)
                                module_size_frac = redisc_prev / nof_predictions
                                assert module_size_frac <= 1
                                module_size_dict[tool].append(module_size_frac)
                                res_table3.write(f'{ident}\t{tool}\t{tp_rate}\t{module_size_frac}\n')
                                res_table1.write('\t')
                                for idx,seed in enumerate(redisc_seeds):
                                    if idx == 0:
                                        res_table1.write(f'{list(redisc_seeds)[0]}')
                                    else:
                                        res_table1.write(f',{seed}')
                                print(f'{tool} rediscovered {redisc_seeds} after removing {rem_seeds}.')
                                res_table2.write(f'{ident}\t{tool}\t{nof_removals},\t{rem_seeds}\t{used_seeds}\t{redisc_prev}\t{redisc_rate}\t{redisc_seeds}\n')
                            res_table1.write('\n')
                            variation_results.append((redisc_rate_dict, redisc_seeds_dict, used_seeds, rem_seeds))
                            for algo1 in redisc_seeds_dict:
                                for algo2 in redisc_rate_dict:
                                    redisc_intersection_matrix.loc[algo1,algo2] += len(redisc_seeds_dict[algo1].intersection(redisc_seeds_dict[algo2]))
        print(f'Result tables are saved in the following locations:')
        print(f'Rediscovered seeds: {result_file_1}')
        print(f'Rediscovery Rates: {result_file_2}')
        print(f'Sensitivity: {result_file_3}')
        fig3, ax6 = plt.subplots(figsize=(20,20))
        ax6 = sb.heatmap(redisc_intersection_matrix, annot=True, fmt='g')
        ax6.set_title('Number of times the algorithms rediscovered the same seeds')
        fig3.savefig(f'{output_dir}/00_seedvariation_heatmap_{identifier}.png', bbox_inches="tight")
        print(f'saved intersection heatmap of all algorithms under:{output_dir}/heatmap_{identifier}.png')
        plt.close(fig3)
        fig3a, ax6a = plt.subplots(figsize=(20,20))
        redisc_intersection_matrix_normalized = redisc_intersection_matrix.apply(lambda row: row/row.max(), axis=1)
        ax6a = sb.heatmap(redisc_intersection_matrix_normalized, annot=True, fmt='.2f')
        ax6a.set_title('Normalized numbers of times the algorithms rediscovered the same seeds')
        fig3a.savefig(f'{output_dir}/00_seedvariation_heatmap_{identifier}_normalized.png', bbox_inches="tight")
        print(f'saved intersection heatmap of all algorithms under:{output_dir}/heatmap_normalized_{cami.uid}.png')
        plt.close(fig3a)

#        print(variation_results)
        rediscovery_rates_results = [results[0] for results in variation_results]
#        print(rediscovery_rates_results)
        tools = [tool for tool in rediscovery_rates_results[0].keys()]
        tool_labels = tools.copy()
        redisc_rates = [[res[tool] for res in rediscovery_rates_results] for tool in tools]
        for idx,tool in enumerate(tools):
            if '_' in tool:
                # find the index of the second occurrence of the character
                second_occurrence_index = tool.find('_', tool.find('_') + 1)
                if second_occurrence_index > -1:
                    # replace the character at that index with the replacement character
                    tool_name = tool[:second_occurrence_index] + '\n' + tool[second_occurrence_index + 1:]
                    tool_labels[idx] = tool_name

        #PLOT
        # Create a figure instance
        #print(sys.getrecursionlimit())
        fig1, (ax1, ax5, ax4) = plt.subplots(3, 1, figsize=(20,20))
        fig1.subplots_adjust(left=0.2)
        # Extract Figure and Axes instance

        # Create a plot
        violins1 = ax1.violinplot(redisc_rates, showmeans=True, showextrema=True)

        for violinpart in list(violins1.keys())[2:]:
            violins1[violinpart].set_color('k')

        for violin in violins1['bodies']:
            violin.set_facecolor('red')
        # Add title
        ax1.set_title(f'Rediscovery rate after randomly removing {nof_removals} seeds {nof_iterations} times from {identifier} seeds.', wrap=True, fontsize=14)

        ax1.set_xticks(list(range(1,len(tools)+1)))
        ax1.set_xticklabels(tool_labels)
        ax1.tick_params(axis='x', labelsize=11)

        ax1.set_ylabel('Rediscovery rate (<rediscovered seeds>/<removed seeds>)', wrap=True, fontsize=14)


        violins2 = ax4.violinplot([tp_rate_dict[tool] for tool in tools], showmeans=True, showextrema=True)
        for violinpart in list(violins2.keys())[2:]:
            violins2[violinpart].set_color('k')
        for violin in violins2['bodies']:
            violin.set_facecolor('orange')
        # Add title
        ax4.set_title(f'True positive rates after randomly removing {nof_removals} seeds {nof_iterations} times from {identifier} seeds.', wrap=True, fontsize=14)

        ax4.set_xticks(list(range(1,len(tools)+1)))
        ax4.set_xticklabels(tool_labels)
        ax4.tick_params(axis='x', labelsize=11)

        ax4.set_ylabel('Sensitivity (TP/TP + FN)', wrap=True, fontsize=14)

        violins3 = ax5.violinplot([module_size_dict[tool] for tool in tools], showmeans=True, showextrema=True)
        # Add title
        for violinpart in list(violins3.keys())[2:]:
            violins3[violinpart].set_color('k')

        for violin in violins3['bodies']:
            violin.set_facecolor('blue')
        ax5.set_title(f'Ratio of number of rediscovered seeds and predicted module size after removing {nof_removals} seeds {nof_iterations} times from {identifier} seeds.', wrap=True, fontsize=14)

        ax5.set_xticks(list(range(1,len(tools)+1)))
        ax5.set_xticklabels(tool_labels)

        ax5.set_ylabel('precision (<rediscovered seeds>/<module size>)', fontsize=14)
        ax5.tick_params(axis='x', labelsize=11)
        fig1.tight_layout()
        fig1.savefig(f'{output_dir}/00_{identifier}_seed_variation_result.png', bbox_inches="tight")
        plt.close(fig1)
        print(f'Violin plot saved under: 00_{identifier}_seed_variation_result.png')

    if save_temps:
        print(f'All temporary files were kept in {tmp_dir}')
        print(f'You can remove them by typing: rm -rv {tmp_dir}')
        print('Please note that CAMI internally works with the indices of the' +
               ' vertices in the PPI-Network. This means that the names in the'+
               ' inputfiles of the tools do not correspond to the gene names ' +
               'but to their indices in the graph_tool Graph that CAMI creates.'
               )
    else:
        print(f'Removing all temporary files in {tmp_dir}...')
        shutil.rmtree(tmp_dir)
        print('Done.')


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="CAMI has two functions:\n" +\
        "Consensus Prediction:\nUse different algorithms to find " +\
        "active disease modules in a given PPI-network and combines their results. "+\
        "Uses a protein-protein-interaction-network (PPI-network) and a seed list as input\n"+\
        "Evaluation:\nEvaluate different tools with respect to the consensus of multiple tools."+\
        "If no mode is specified both functions are excecuted.")
    parser.add_argument('-n', '--ppi_network', required=True, action='store',
            help="Path to a tsv file containing the edges of the base PPI in the first two columns")
    parser.add_argument('-s','--seeds', required=True, action='store',
            help="Path to a txt file containing the seeds delimitered by breakline characters")
    parser.add_argument('-t', '--tools', nargs='+', action='store', default=["diamond", "domino", "robust"],
            help="List of tools that the user wants to use for prediction.\n Available tools are:\n" + \
                "domino\ndiamond\nrobust\hotnet\nThe default tools are: diamond, domino and robust.")
    parser.add_argument('-ext', '--external_results', nargs='+', action='store', default=[],
            help="List of paths to results generated by external tools. The files should contain the tool name in the first line and then the predicted genes (1 gene per line)")
    parser.add_argument('-w', '--tool_weights', nargs='+', action='store', default=None,
            help="List of weights for the tools. If you have [domino, diamond, robust] as list of tools and diamonds weight should be twice as high as the other tools type: 1 2 1")
    parser.add_argument('-c', '--consensus', action='store_true', help="run only the consensus prediction part of cami")
    parser.add_argument('-var', '--seed_variation', action='store', help="repeat consensus selection multiple times (please provide the number of iterations) while removing 20 percent of the seeds.")
    parser.add_argument('-e', '--evaluate', action='store_true', help="run only the evaluation part of cami")
    parser.add_argument('-o', '--output_dir', action='store', help="path to output directory")
    parser.add_argument('-id', '--identifier', action='store', help="ID for the current excecution of cami. Defaults to a randomly generated ID")
    parser.add_argument('-tmp', '--save_temps', action='store_true', help="keep all temporary files")
    parser.add_argument('-v', '--nvenn', action='store_true', help="Visualize results using nVenn by Degradome, an external webtool. Please note that degradome can only be used for visualization with up to 5 tools.")
    parser.add_argument('-img', '--save_image', action='store_true', help="Save the venn diagram from the visualization as png. (Only possible for up to 5 tools)")
    parser.add_argument('-f', '--force', action='store_true', help="Ignore warnings and overwrite everything when excecuting CAMI.")
    parser.add_argument('-d', '--drugstone', nargs='*', action='store', default=None,
                        help="Visualize the cami module via the drugstone API. If necessary the user needs to provide a list of the two titles of the two columns that contain the symbols of the gene edges in the inputfile of the PPI network. The order needs to correspond to the order of the first two columns. The default is 'Official_Symbol_Interactor_A Official_Symbol_Interactor_B'. Please note that the symbol columns cannot be the first two columns. If the two main edges in the first two columns are correspond also the gene symbols please duplicate these columns.")
    parser.add_argument('-ncbi', '--ncbi', action='store_true',
                        help="Save the NCBI URLs and Summaries of the genes in the CAMI output.")
    parser.add_argument('-conf', '--configuration', nargs='*', action='store', default='camiconf',
                        help="Choose a configuration for the static variables.")
    parser.add_argument('-p', '--parallelization', action='store_true',
            help="run the tools for prediction parallelized")
    parser.add_argument('-db', '--debug', action='store_true',
            help="run CAMI with verbose outputs")
    #TODO List with additional arguments if needed by certain tools

    args = vars(parser.parse_args())
    if not os.path.exists(args['ppi_network']):
        raise RuntimeError('File does not exist: ' + args['ppi_network'])

    if not os.path.exists(args['seeds']):
        raise RuntimeError('File does not exist: ' + args['seeds'])

# add name for implemented tools here:
    implemented_tools = [
                         "domino",
                         "diamond",
                         "robust",
                         "hotnet"
                         ]
    if args['tools']:
        for tool in args['tools']:
            if tool not in implemented_tools:
                raise RuntimeError(tool + f' not implemented yet. Implemented tools are: {implemented_tools}')

    main(**args)