cami_suite.py
    import threading, biodigest, os
    from utils import drugstone, degradome, ncbi
    from algorithms.DiamondWrapper import DiamondWrapper
    from algorithms.DominoWrapper import DominoWrapper
    from algorithms.RobustWrapper import RobustWrapper
    from configparser import ConfigParser
    import preprocess
    from consensus import cami_v1, cami_v2, cami_v3
    
    
    def list_combinations(lst, k):
        """creates all possible combinations of length k with two objects in a list
    
        :param lst: a list with length 2
        :type lst: list()
        :param k: length of the combinations
        :type k: int
        """
        nof_combs = 2 ** k
        l = nof_combs // 2
        columns = []
        while l >= 1:
            # build one column by alternating blocks of lst[0] and lst[1] of length l,
            # like one bit position of a binary counter
            column = []
            while len(column) < nof_combs:
                for _ in range(l):
                    column.append(lst[0])
                for _ in range(l):
                    column.append(lst[1])
            columns.append(column)
            l //= 2
        combs = [tuple(column[i] for column in columns) for i in range(nof_combs)]
        assert len(set(combs)) == nof_combs
        return combs
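
    # Example (illustrative, not part of the original source): with two tools 'A' and 'B',
    # list_combinations(['A', 'B'], 2) returns the four ordered tuples
    # [('A', 'A'), ('A', 'B'), ('B', 'A'), ('B', 'B')].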
    
    def initialize_cami(path_to_ppi_file=''):
        cami_params = {}
        # find homepath aka ~/cami
        current_wd = os.getcwd()
        current_wd_lst = current_wd.rsplit('/', 1)
        current_folder = current_wd_lst[-1]
        while current_folder != 'cami':
            os.chdir(current_wd_lst[0])
            current_wd = os.getcwd()
            current_wd_lst = current_wd.rsplit('/', 1)
            current_folder = current_wd_lst[-1]
        home_path = current_wd
        cami_params['home_path'] = home_path
        cami_source = os.path.join(home_path, 'cami_src')
        cami_params['cami_src_path'] = cami_source
        
        # initialize tool wrappers
        diamond = DiamondWrapper()
        domino = DominoWrapper()
        robust = RobustWrapper()
        wrappers = [diamond, domino, robust]
        nof_tools = len(wrappers)
        cami_params['tool_wrappers'] = wrappers
        
        # preprocessing
        if path_to_ppi_file == '':
            ppi_network = 'example_network.tsv'
            ppi_file = os.path.join(home_path, f'data/input/networks/{ppi_network}')
        else:
            ppi_file = path_to_ppi_file
            
        symbol_columns = [] # if the two symbol columns in the ppi_network file are not named
                                # ['Official_Symbol_Interactor_A', 'Official_Symbol_Interactor_B']
                                # provide the names here.
        ppi_graph = preprocess.csv2graph(ppi_file, symbol_columns, nof_tools)
        
        cami_params['ppi_graph'] = ppi_graph
        
        # dictionary with name of the seeds and file 
        seed_directory = os.path.join(home_path, 'data/input/seeds')
        seed_files = {'adhd': 'adhd.tsv',
                      'alcl': 'alcl.tsv',
                      'joubert': 'joubert_syndrome.tsv'}
        seed_paths = {}
        for seed_file in seed_files:
            seed_paths[seed_file] = os.path.join(seed_directory, seed_files[seed_file])
    
        seed_lists = {seedname: preprocess.txt2lst(seed_paths[seedname]) for seedname in seed_paths}
        cami_params['seed_lists'] = seed_lists
        return cami_params
    
        
    
    class cami():
        """ A module that is used for Active Module identifaction based on a
            consensus approach
        """
        def __init__(self, ppi_graph, seed_lst, tool_wrappers, output_dir, uid, home_path, tmp_dir='', config='camiconf', seed_score=10, parallelization=False):
            """Instance variables of CAMI
    
            :param ppi_graph: The PPI graph on which all predictions in CAMI are based
            :type ppi_graph: Graph()
            :param seed_lst: A list of vertices that are the seeds for the predictions
            :type seed_lst: list(Graph().vertex())
            :param tool_wrappers: A list of AlgorithmWrapper() objects that correspond to the tools used for the predictions
            :type tool_wrappers: list(AlgorithmWrapper())
            :param output_dir: The path to the directory where the results are supposed to be saved
            :type output_dir: str
            :param uid: Identifier for the current execution of CAMI
            :type uid: str
            :param home_path: Path to the cami home directory (gitlab repository)
            :type home_path: str
            :param tmp_dir: Directory where temporary files should be saved
            :type tmp_dir: str
            :param config: Name of the configuration file to read (default: 'camiconf')
            :type config: str
            :param seed_score: Score assigned to seed vertices in the PPI graph
            :type seed_score: int
            :param parallelization: Whether the tools should be run in parallel threads
            :type parallelization: bool
            """
            self.ppi_graph = ppi_graph
            self.origin_ppi_graph = ppi_graph.copy()
            self.ppi_vertex2gene = self.ppi_graph.vertex_properties["name"]
            self.ppi_gene2vertex = {self.ppi_vertex2gene[vertex]:vertex for vertex in self.ppi_graph.vertices()}
            self.initial_seed_lst = None
            self.seed_lst = seed_lst
            self.origin_seed_lst = seed_lst.copy()
            self.tool_wrappers = tool_wrappers
            self.output_dir = output_dir
            self.home_path = home_path
            self.uid = str(uid)
            if tmp_dir == '':
                tmp_dir = os.path.join(home_path, 'data', 'tmp', self.uid)
            self.tmp_dir = tmp_dir
            
            self.nof_tools = len(tool_wrappers)
            self.result_gene_sets = {} #contains the genes predicted by the tools (not the indices)
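            # map numeric tool codes to tool names; code 0 is reserved for the CAMI consensus itself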
            self.code2toolname = {tool.code:tool.name for tool in self.tool_wrappers}
            self.code2toolname[0] = 'CAMI'
            self.cami_vertices = []
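            # whether to query NCBI for gene URLs and summaries when writing the CAMI output files
            # (see generate_output)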
            self.ncbi = False
            
            # read the configuration from the file given via the config parameter (default: 'camiconf')
            config_parser = ConfigParser()
            config_parser.read(config)
            self.seed_score = config_parser.get('cami', 'seed_score')
            self.config = config_parser
            self.threaded = parallelization
            # set weights for seed genes in ppi_graph
            for seed in self.seed_lst:
                self.ppi_graph.vertex_properties["cami_score"][seed] = self.seed_score
        
        def reset_cami(self, new_uid='', change_tmp=False):
            if new_uid != '':
                self.uid = new_uid
            if change_tmp:
                new_tmp_dir = os.path.join(self.home_path, 'data', 'tmp', self.uid)
                os.makedirs(new_tmp_dir, exist_ok=True)
                self.tmp_dir = new_tmp_dir
            self.ppi_graph = self.origin_ppi_graph.copy()
            self.result_gene_sets = {}
            self.cami_vertices = []
            self.seed_lst = self.origin_seed_lst.copy()
    
        def set_initial_seed_lst(self, seedlst):
            self.initial_seed_lst = seedlst
    
        def initialize_tool(self, tool):
            tool.set_ppi_network(self.ppi_graph)
            tool.set_seeds(self.seed_lst)
            tool.set_homepath(self.home_path)
            tool.set_id(self.uid)
            tool.set_config(self.config)
            
        def initialize_all_tools(self):
            for tool in self.tool_wrappers:
                self.initialize_tool(tool)
    
        def run_tool(self, tool):
            """Excecute the predictions using the AlgorithmWrapper() of a tool
    
            :param tool: A tool that has the following methods: prepare_input, run_tool() and extract_output()
            :type tool: AlgorithmWrapper()
            :return: A set of indices of the vertices predicted by the used tool
            :rtype: set()
            """
            tool.create_tmp_output_dir(self.tmp_dir) # creates the temporary output directory
            print(f"preparing {tool.name} input...")
            inputparams = tool.prepare_input()
            print(f'running {tool.name}...')
            preds = set(tool.run_algorithm(inputparams))
            print(f'{tool.name} predicted {len(preds)} active vertices (seeds not excluded):')
            print(preds)
            return preds
    
        def make_evaluation(self):
            print(self.result_gene_sets)
            biodigest.setup.main(setup_type="api")
            for result_set in self.result_gene_sets:
                validation_results = biodigest.single_validation.single_validation(
                    tar=set(self.result_gene_sets[result_set]),
                    tar_id='entrez',
                    mode='set-set',
                    distance='jaccard',
                    ref=set(self.seed_lst), 
                    ref_id='entrez')
                if validation_results['status'] == 'ok':  
                    biodigest.single_validation.save_results(validation_results, f'{result_set}_{self.uid}', self.output_dir)
                    biodigest.evaluation.d_utils.plotting_utils.create_plots(results=validation_results, 
                                 mode='set-set', 
                                 tar=set(self.result_gene_sets[result_set]), 
                                 tar_id='entrez', 
                                 out_dir=self.output_dir, 
                                 prefix=f'{result_set}_{self.uid}')
        def run_threaded_tool(self, tool, pred_sets):
            """run a tool in one thread and save the results into a dictionary pred_sets
    
            Args:
                tool (AlgorithmWrapper): Wrapper class for a tool
                pred_sets (dict): a dictionary that maps a tool to its result set
            """
            preds = self.run_tool(tool)
            pred_sets[tool] = preds #- seed_set
    
        def make_predictions(self) -> dict:
            """create all predictions using the tools specified in tool_wrappers
    
            :return: A dictionary that saves the predicted vertices with respect
                     to the corresponding tool
            :rtype: dict(AlgorithmWrapper():set(Graph.vertex()))
            """
            print(f'Creating result sets of all {self.nof_tools} tools...')
            pred_sets = {tool:None for tool in self.tool_wrappers}
            
            if self.threaded:
                threads = [threading.Thread(target=self.run_threaded_tool, args=(tool, pred_sets,)) 
                        for tool in self.tool_wrappers]
                for thread in threads:
                    thread.start()
                    
                for thread in threads:
                    thread.join()
            else:
                for tool in self.tool_wrappers:
                    pred_sets[tool] = self.run_tool(tool)
            
            assert(list(pred_sets.values()).count(None) < 1)
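            # translate the raw vertex indices returned by the tools back into vertex objects of the PPI graph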
            result_sets = {tool:set([self.ppi_graph.vertex(idx) for idx in pred_sets[tool]])
                           for tool in pred_sets}
            return result_sets
    
        def take_custom_results(self, inputfiles, result_sets=None):
            """Takes a dictionary of input files and extracts the results from them to
               include them in the consensus with the tools of CAMI

            :param inputfiles: A dictionary with the following properties:
                               key: the used tool (AlgorithmWrapper())
                               value: path to a result file of that tool
                               (first line: tool name, afterwards one gene per line)
            :type inputfiles: dict
            :return: A dictionary that saves the predicted vertices with respect
                     to the corresponding tool
            :rtype: dict(AlgorithmWrapper():set(Graph.vertex()))
            """
            if result_sets is None:
                result_sets = {}
            for tool in inputfiles:
                result_list = []
                with open(inputfiles[tool]) as rfile:
                    for idx, line in enumerate(rfile):
                        if idx == 0:
                            tool.name = line.strip()
                            self.code2toolname[tool.code] = tool.name
                        else:
                            node = line.strip()
                            if node in self.ppi_gene2vertex:
                                result_list.append(self.ppi_gene2vertex[node])
                    result_sets[tool] = set(result_list)
            return result_sets
    
        def create_consensus(self, result_sets):
            """takes a set of active module predictions and creates a consensus
               that combines all the results of the different tools.
    
            :param result_sets: A dictionary with the following properties:
                              key: The used tool as AlgorithmWrapper() Object
                              values: Set of vertices in the ppi_graph that were
                                      predicted by the key-tool
            :type result_sets: {AlgorithmWrapper(): {Graph().vertex()}}
            """
            # calculate gene weights
            # set of all result genes 
            cami_scores = self.ppi_graph.vertex_properties["cami_score"]
            predicted_by = self.ppi_graph.vertex_properties["predicted_by"]
            consens_threshold = min(self.nof_tools, 2)
            ppi_graph = self.ppi_graph
            seed_list = self.seed_lst
            tool_name_map = self.code2toolname
            gene_name_map = self.ppi_vertex2gene
    
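            # consensus variants to run: cami_v1, cami_v2 and cami_v3 with different parameter presets;
            # the suffixes _tr, _bc and _m refer to the 'trustrank', 'betweenness' and 'must' ranking options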
            camis = {
                'cami_v1': {'function': cami_v1.run_cami, 'params': {'consens_threshold': consens_threshold}},
                'cami_v2_param1_tr': {'function': cami_v2.run_cami, 'params': {
                    'hub_penalty': 0.8, 'damping_factor': 0.5, 'confidence_level': 0.75
                }},
                'cami_v2_param1_bc': {'function': cami_v2.run_cami, 'params': {
                    'hub_penalty': 0.8, 'damping_factor': 0.5, 'confidence_level': 0.75, 'ranking': 'betweenness'
                }},
                'cami_v2_param1_m': {'function': cami_v2.run_cami, 'params': {
                    'hub_penalty': 0.8, 'damping_factor': 0.5, 'confidence_level': 0.75, 'ranking': 'must'
                }},
                'cami_v2_param2_tr': {'function': cami_v2.run_cami, 'params': {
                    'hub_penalty': 0, 'damping_factor': 0.7, 'confidence_level': 0.5
                }},
                'cami_v2_param2_m': {'function': cami_v2.run_cami, 'params': {
                    'hub_penalty': 0, 'damping_factor': 0.7, 'ranking': 'must',
                }},
                'cami_v2_param2_bc': {'function': cami_v2.run_cami, 'params': {
                    'hub_penalty': 0, 'damping_factor': 0.7, 'confidence_level': 0.5, 'ranking': 'betweenness'
                }},
                'cami_v3_param1_tr': {'function': cami_v3.run_cami, 'params': {
                    'hub_penalty': 0.8, 'damping_factor': 0.5, 'confidence_level': 0.75
                }},
                'cami_v3_param1_bc': {'function': cami_v3.run_cami, 'params': {
                    'hub_penalty': 0.8, 'damping_factor': 0.5, 'confidence_level': 0.75, 'ranking': 'betweenness'
                }},
                'cami_v3_param1_m': {'function': cami_v3.run_cami, 'params': {
                    'hub_penalty': 0.8, 'damping_factor': 0.5, 'confidence_level': 0.75, 'ranking': 'must'
                }},
                'cami_v3_param2_tr': {'function': cami_v3.run_cami, 'params': {
                    'hub_penalty': 0, 'damping_factor': 0.7, 'confidence_level': 0.5, 'ranking': 'trustrank'
                }},
                'cami_v3_param2_bc': {'function': cami_v3.run_cami, 'params': {
                    'hub_penalty': 0, 'damping_factor': 0.7, 'confidence_level': 0.5, 'ranking': 'betweenness'
                }},
                'cami_v3_param2_m': {'function': cami_v3.run_cami, 'params': {
                    'hub_penalty': 0, 'damping_factor': 0.7, 'confidence_level': 0.5, 'ranking': 'must'
                }},
                'cami_v3_param3_m': {'function': cami_v3.run_cami, 'params': {
                    'hub_penalty': 0, 'damping_factor': 0.7, 'confidence_level': 0.5, 'ranking': 'must', 'trees': 15
                }},
            }
    
            for cami_method_name, cami_params in camis.items():
                print("Running " + cami_method_name)
                cami_vertices, putative_vertices, codes2tools = cami_params['function'](result_sets, ppi_graph, seed_list,
                                                                                        predicted_by, cami_scores,
                                                                                        tool_name_map,
                                                                                        cami_params['params'])
    
                # sort the resulting vertices according to their cami_score
                cami_vlist = sorted(cami_vertices, key=lambda v: cami_scores[v], reverse=True)
    
                seed_genes = [self.ppi_vertex2gene[seed_vertex] for seed_vertex in seed_list]
                # translate the resulting vertex() ids to the corresponding names in the ppi network
                cami_genes = [self.ppi_vertex2gene[cami_vertex] for cami_vertex in cami_vlist]
    
                print(f'With the given seed genes: {seed_genes} \n' +
                      f'CAMI ({cami_method_name}) proposes the following genes to add to the Active Module (sorted by CAMI Score):')
                for vertex in cami_vlist:
                    print(f'{gene_name_map[vertex]}\t{cami_scores[vertex]}\t{codes2tools[vertex]}')
                # for visualization
                self.result_gene_sets[cami_method_name] = cami_genes
    
                if cami_method_name == 'cami_v1':
                    # for drugstone
                    self.cami_vertices = cami_vlist
    
                # save the results in outputfiles
                self.generate_output(cami_method_name, seed_genes, cami_vlist, cami_vertices, putative_vertices, cami_genes,
                                     gene_name_map, codes2tools, result_sets, cami_scores)
    
        def generate_output(self, cami_method, seed_genes, cami_vlist, cami_vertices, putative_vertices, cami_genes,
                                gene_name_map, codes2tools, result_sets, cami_scores):
            # save all predictions by all tools
            print('Saving the results...')
            with open(f'{self.output_dir}/all_predictions_{self.uid}.tsv', 'w') as outputfile:
                outputfile.write(f'CAMI predictions with {len(self.seed_lst)} of initially {len(self.initial_seed_lst)} seeds: {seed_genes},\n'+
                                 f'initially: {self.initial_seed_lst}\n')
                outputfile.write(f'gene\tpredicted_by\tcami_score\tindex_in_graph\tdegree_in_graph\n')
                all_vertices = cami_vertices.union(putative_vertices)
                for vertex in all_vertices:
                    outputfile.write(f'{gene_name_map[vertex]}\t{codes2tools[vertex]}\t{cami_scores[vertex]}\t{str(vertex)}\t{vertex.out_degree()}\n')
            print(f'saved all predictions by the used tools in: {self.output_dir}/all_predictions_{self.uid}.tsv')
    
            # save the predictions made by cami
            ncbi_url = ('\tncbi_url' if self.ncbi else '')
            ncbi_summary = ('\tncbi_summary' if self.ncbi else '')
    
            with open(f'{self.output_dir}/CAMI_output_{self.uid}.tsv', 'w') as outputfile:
                outputfile.write(f'gene\tindex_in_graph\tcami_score\tdegree_in_graph{ncbi_url}{ncbi_summary}\n')     
                for vertex in cami_vlist:
                    if self.ncbi:
                        url, summary = ncbi.send_request(gene_name_map[vertex])
                        url = '\t' + url
                        if summary is not None:
                            summary = '\t' + summary
                        else:
                            summary = ''
                    else:
                        url, summary = '',''
                    outputfile.write(f'{gene_name_map[vertex]}\t{str(vertex)}\t{cami_scores[vertex]}\t{vertex.out_degree()}{url}{summary}\n')
            
            # save the whole module
            whole_module = []
            with open(f'{self.output_dir}/CAMI_module_{cami_method}_{self.uid}.txt', 'w') as modfile:
                for gene in seed_genes:
                    modfile.write(f'{gene}\n')
                    whole_module.append(gene)
                for gene in cami_genes:
                    modfile.write(f'{gene}\n')
                    whole_module.append(gene)

            print(f'saved cami output in: {self.output_dir}/CAMI_output_{self.uid}.tsv')
            print(f'saved the Consensus Active Module by CAMI in: {self.output_dir}/CAMI_module_{cami_method}_{self.uid}.txt')
            
            # transform all vertex indices to their corresponding gene names in a result set
            for tool in result_sets:
                self.result_gene_sets[tool.name] = set([gene_name_map[vertex] for vertex in result_sets[tool]])
            
            # save predictions by the other tools
            for tool in self.result_gene_sets:
                with open(f'{self.output_dir}/{tool}_output_{self.uid}.tsv', 'w') as outputfile:
                    outputfile.write('gene\n')
                    for gene in self.result_gene_sets[tool]:
                        outputfile.write(f'{gene}\n')
                print(f'saved {tool} output in: {self.output_dir}/{tool}_output_{self.uid}.tsv')
                
            # for drugstone
            self.cami_vertices = cami_vlist
            
            # return values
            consensus = {}
            consensus['module'] = whole_module
            consensus['seeds'] = self.seed_lst
            return consensus
    
    
        def use_nvenn(self):
            """Create Venn Diagrams via a external tool named degradome.
               Sends a request via requests to the degradome server.
               Returns the URL of the result.
            """
            # visualize with degradome
            if self.nof_tools < 7:
                print('Visualizing results using Degradome...')
                degradome_sets = {tool:self.result_gene_sets[tool] 
                                  for tool in self.result_gene_sets 
                                  if len(self.result_gene_sets[tool])>0}
                url = degradome.send_request(degradome_sets, {self.ppi_vertex2gene[seed] for seed in self.seed_lst})
                with open(f'{self.output_dir}/venn_link_{self.uid}.txt', 'w') as f:
                    f.write(url)
                return url
    
            # elif nof_tools == 6:
            #     print('Visualizing using Degradome...(seeds excluded from results)')
            #     # degradome_sets = result_sets.copy()
            #     # degradome_sets['CAMI'] = set(result_genes)
            #     url = degradome.send_request(degradome_sets)
            #     webbrowser.open(url)
            else:
                print('Cannot use degradome to create Venn diagrams of 7 or more tools')
    
        def download_diagram(self, url):
            venn_name = f'{self.output_dir}/vdiagram_{self.uid}'
            response = degradome.download_image(url, venn_name + '.png')
            if response is not None:
                with open(f'{venn_name}.html', 'w') as r:
                    r.write(response.html.html)
    
        def use_drugstone(self):
            symbol = self.ppi_graph.vertex_properties["symbol"]
            cami_module = self.cami_vertices + self.seed_lst
            cami_symbols = [symbol[vertex] for vertex in cami_module]
            cami_symbol_edges = []
            
            for vertex in self.cami_vertices:
                for edge in vertex.all_edges():
                    cami_symbol_edges.append((symbol[edge.source()], symbol[edge.target()]))
            #print(list(set(cami_symbol_edges)))
            url = drugstone.send_request(cami_symbols, cami_symbol_edges)
            print(f'You can find a network visualization of the CAMI module via: {url}')
            print('The link was also saved in the outputfolder for later.')
            with open(f'{self.output_dir}/drugstone_link_{self.uid}.txt', 'w') as f:
                f.write(url)
            return url
    
        def remove_seeds(self, idx_lst):
            """remove seeds at indices idx
    
            Args:
                idx_lst (lst): list of indices to be removed
            """
            removed_seeds = [self.seed_lst[idx] for idx in idx_lst]
            self.seed_lst = [seed for seed in self.seed_lst if seed not in removed_seeds]
            for seed in self.seed_lst:
                self.ppi_graph.vertex_properties["cami_score"][seed] = self.seed_score
            for seed in removed_seeds:
                self.ppi_graph.vertex_properties["cami_score"][seed] = 0.0
            return removed_seeds
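

    # ------------------------------------------------------------------
    # Minimal usage sketch (illustrative only, not part of the original module):
    # it wires the pieces above together the way initialize_cami() suggests.
    # Assumptions: the script is run from the cami repository root, the example
    # network and seed files exist at the paths used in initialize_cami(), the
    # uid and output directory are made up for this example, and the seed file
    # contains gene names matching the "name" vertex property of the PPI graph.
    if __name__ == '__main__':
        home = os.getcwd()
        ppi_file = os.path.join(home, 'data/input/networks/example_network.tsv')
        seed_file = os.path.join(home, 'data/input/seeds/adhd.tsv')
        output_dir = os.path.join(home, 'data/output/example_run')
        os.makedirs(output_dir, exist_ok=True)

        # build the PPI graph and the seed vertex list with the preprocessing helpers
        wrappers = [DiamondWrapper(), DominoWrapper(), RobustWrapper()]
        ppi_graph = preprocess.csv2graph(ppi_file, [], len(wrappers))
        seed_genes = preprocess.txt2lst(seed_file)
        seed_vertices = [v for v in ppi_graph.vertices()
                         if ppi_graph.vertex_properties["name"][v] in seed_genes]

        # run the full pipeline: tool predictions followed by the consensus
        suite = cami(ppi_graph, seed_vertices, wrappers, output_dir,
                     uid='example_run', home_path=home)
        suite.set_initial_seed_lst(seed_vertices.copy())
        suite.initialize_all_tools()
        result_sets = suite.make_predictions()
        suite.create_consensus(result_sets)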