Skip to content
Snippets Groups Projects
Commit 0c89a26d authored by Le, Mia's avatar Le, Mia
Browse files

fixed usage of predicted_by vertex_property

parent c0adfa23
No related branches found
No related tags found
No related merge requests found
......@@ -16,6 +16,7 @@ class AlgorithmWrapper(object):
self.home_path = ''
self.config = 'camiconf'
self.code = 99
self.debug = False
def set_weight(self, weight):
self.weight = weight
......
......@@ -63,6 +63,7 @@ class DiamondWrapper(AlgorithmWrapper):
file.write(f"{str(edge.source())},{str(edge.target())}\n")
inputparams.append(ppi_file)
assert os.path.exists(ppi_file), f'Could not create PPI-network file "{ppi_file}"'
if self.debug:
print(f'{self.name} ppi is saved in {ppi_file}')
# create seed file
......@@ -71,6 +72,7 @@ class DiamondWrapper(AlgorithmWrapper):
for seed in self.seeds:
file.write(f"{seed}\n")
assert os.path.exists(seed_file), f'Could not create seed file "{seed_file}"'
if self.debug:
print(f'{self.name} seeds are saved in {seed_file}')
inputparams.append(seed_file)
......@@ -79,6 +81,7 @@ class DiamondWrapper(AlgorithmWrapper):
#MC:
#CONFIG pred_factor = 10, max_preds = 100
nof_preds = min([nof_seeds * self.pred_factor, self.max_preds])
if self.debug:
print(f'With {nof_seeds} seeds, {self.name} will try to predict {nof_preds} active modules.')
inputparams.append(nof_preds)
return inputparams
......
......@@ -64,6 +64,7 @@ class DominoWrapper(AlgorithmWrapper):
"""
inputparams = []
# prepare inputfiles
if self.debug:
print(f'creating {self.name} input files in {self.output_dir}')
ppi_filename = self.name_file('ppi', 'sif')
......@@ -87,15 +88,18 @@ class DominoWrapper(AlgorithmWrapper):
for seed in self.seeds:
file.write(f"{seed}_\n")
inputparams.append(seed_file)
if self.debug:
print(f'{self.name} seeds are saved in {seed_file}')
slices_filename = self.name_file('slices')
slices_output = os.path.join(self.output_dir, slices_filename)
if not os.path.exists(slices_output):
if self.debug:
print('creating domino slices_file...')
command = f'slicer --network_file "{ppi_file}" --output_file "{slices_output}"'
subprocess.call(command, shell=True, stdout=subprocess.PIPE)
if self.debug:
print(f'{self.name} slices are saved in {slices_output}')
inputparams.append(slices_output)
return inputparams
......
......@@ -82,11 +82,13 @@ class RobustWrapper(AlgorithmWrapper):
for edge in self.ppi_network.edges():
file.write(f"{str(edge.source())}\t{str(edge.target())}\n")
inputparams.append(ppi_file)
if self.debug:
print(f'{self.name} ppi is saved in {ppi_file}')
with open(seed_file, "w") as file:
for seed in self.seeds:
file.write(f"{seed}\n")
if self.debug:
print(f'{self.name} seeds are saved in {seed_file}')
inputparams.append(seed_file)
......
......@@ -136,8 +136,9 @@ def main(ppi_network, seeds, tools, tool_weights, consensus, evaluate,
result_sets = cami.take_custom_results(external_input, result_sets)
cami.create_consensus(result_sets)
for result in result_sets.keys():
result_sets[result] = result_sets[result].union(cami.seed_lst)
# adds the seeds to the results, right now result_sets contains the seeds that should be ADDED to the module
# for result in result_sets.keys():
# result_sets[result] = result_sets[result].union(cami.seed_lst)
if nvenn or save_image:
print('Sending results to nVenn')
......@@ -169,11 +170,6 @@ def main(ppi_network, seeds, tools, tool_weights, consensus, evaluate,
if nvenn and vis:
url = cami.nvenn()
cami.download_diagram(url)
with open('/Users/Mia/cami_local/cami/data/output/explorativeness.tsv', 'a') as f:
make_consensus(vis=True)
seedname = seeds
for tool in cami.result_gene_sets:
f.write(f'\n{seedname}\t{len(cami.seed_lst)}\t{tool}\t{len(cami.result_gene_sets[tool])}')
with open(f'{output_dir}/00_node_degrees.tsv', 'w') as node_degrees:
node_degrees.write('vertex\tout_degree\tin_degree\n')
......@@ -184,7 +180,15 @@ def main(ppi_network, seeds, tools, tool_weights, consensus, evaluate,
base_seeds = cami.origin_seed_lst
original_seeds = [cami.ppi_vertex2gene[seed] for seed in base_seeds]
print(f'Initializing CAMI and the seed variation by running CAMI with all given seeds:{original_seeds}')
with open('/Users/Mia/cami_local/cami/data/output/explorativeness.tsv', 'a') as f:
make_consensus(vis=True)
seedname = seeds
for tool in cami.result_gene_sets:
f.write(f'\n{seedname}\t{len(cami.seed_lst)}\t{tool}\t{len(cami.result_gene_sets[tool])}')
#make_consensus(vis=True)
random.seed(50)
removal_frac = 0.2
nof_iterations = int(seed_variation)
......@@ -215,10 +219,11 @@ def main(ppi_network, seeds, tools, tool_weights, consensus, evaluate,
res_table1.write(f'{ident}')
# update uid
new_identifier = identifier + f'_{ident}'
# reset cami
cami.reset_cami(new_uid=new_identifier)
# cami.ppi_graph = original_ppi
#remove seeds
#remove seeds (again)
print(f'Removing {nof_removals} seeds from the original seed list...')
removed_seeds_idx = random.sample(list(range(nof_seeds)), nof_removals)
removed_seeds = cami.remove_seeds(removed_seeds_idx)
......@@ -273,13 +278,19 @@ def main(ppi_network, seeds, tools, tool_weights, consensus, evaluate,
rediscovery_rates_results = [results[0] for results in variation_results]
# print(rediscovery_rates_results)
tools = [tool for tool in rediscovery_rates_results[0].keys()]
for idx,tool in enumerate(tools):
if '_' in tool:
tmp_lst = tool.split('_')
linebreak_pos = len(tmp_lst)//2
added_linebreak_lst = tmp_lst[:linebreak_pos] + ['\n'] + tmp_lst[linebreak_pos:]
tools[idx] = ''.join(added_linebreak_lst)
redisc_rates = [[res[tool] for res in rediscovery_rates_results] for tool in tools]
#PLOT
# Create a figure instance
plt.figure(figsize=(16,6))
plt.figure(figsize=(50,8))
# Extract Figure and Axes instance
ax1 = plt.subplot(1,2,1, label='ax1')
......@@ -331,55 +342,6 @@ def main(ppi_network, seeds, tools, tool_weights, consensus, evaluate,
plt.savefig(f'{output_dir}/00_{identifier}_seed_variation_result.png')
print(f'Violin plot saved under: 00_{identifier}_seed_variation_result.png')
# plot TP Rate
# Extract Figure and Axes instance
fig2, ax2 = plt.subplots()
colors = ['red', 'blue', 'black', 'purple']
legend = []
# Create a plot
for idx,tool in enumerate(used_tools):
scatter = ax2.scatter(list(range(1,nof_iterations + 1)),tp_rate_dict[tool], color=colors[idx])
legend.append(scatter)
plt.legend(legend,
used_tools)
# Add title
ax2.set_title(f'Sensitivity (TP/(TP + FN)) in {nof_iterations} iterations.', wrap=True)
ax2.set_xticks(list(range(1,nof_iterations + 1)))
ax2.set_xticklabels([idx if idx%5==0 else '' for idx in range(1,nof_iterations+1)])
ax2.set_xlabel('Iterations')
ax2.set_ylabel('Sensitivity (TP/(TP + FN))')
# Save the figure
sensitivity_file = f'{output_dir}/00_{identifier}_seed_variation_tp_rates.png'
fig2.savefig(sensitivity_file)
print(f'Sensitivity plot saved under {sensitivity_file}')
# plot module size frac
fig3, ax3 = plt.subplots()
legend = []
for idx,tool in enumerate(used_tools):
scatter = ax3.scatter(list(range(1,nof_iterations + 1)), module_size_dict[tool], color=colors[idx])
legend.append(scatter)
plt.legend(legend,
used_tools)
# Add title
ax3.set_title(f'Ratio of number of rediscovered seeds and CAMI module size', wrap=True)
ax3.set_xticks((list(range(1,nof_iterations + 1))))
ax3.set_xticklabels([idx if idx%5==0 else '' for idx in range(1,nof_iterations+1)])
ax3.set_xlabel('Iterations')
ax3.set_ylabel('Module size ratio (<rediscovered seeds>/<module size>)')
# Save the figure
size_file = f'{output_dir}/00_{identifier}_redisc_modulesize_rate.png'
fig3.savefig(size_file)
print(f'Sensitivity plot saved under {size_file}')
if save_temps:
print(f'All temporary files were kept in {tmp_dir}')
......
......@@ -107,6 +107,7 @@ class cami():
:param home_path: Path to the cami home directory (gitlab repository)
:type home_path: str
"""
self.debug = False
self.ppi_graph = ppi_graph
self.origin_ppi_graph = ppi_graph.copy()
self.ppi_vertex2gene = self.ppi_graph.vertex_properties["name"]
......@@ -123,10 +124,11 @@ class cami():
self.tmp_dir = tmp_dir
self.nof_tools = len(tool_wrappers)
self.result_gene_sets = {} #contains the genes predicted by the tools (not the indices)
self.result_gene_sets = {} #contains the genes predicted by the tools (not the indices) WITHOUT seeds
self.result_module_sets = {} #contains the genes predicted by the tools (not the indices) WITH seeds
self.cami_module = [] # TODO: pick place where cami_module is set, which consensus approach should we use?
self.code2toolname = {tool.code:tool.name for tool in self.tool_wrappers}
self.code2toolname[0] = 'CAMI'
self.cami_vertices = []
self.code2toolname[0] = 'No tool'
self.ncbi = False
config = ConfigParser()
......@@ -149,8 +151,11 @@ class cami():
self.tmp_dir = new_tmp_dir
self.ppi_graph = self.origin_ppi_graph.copy()
self.result_gene_sets = {}
self.result_module_sets = {}
self.cami_vertices = []
self.seed_lst = self.origin_seed_lst.copy()
self.code2toolname = {tool.code:tool.name for tool in self.tool_wrappers}
self.code2toolname[0] = 'No tool'
def set_initial_seed_lst(self, seedlst):
self.initial_seed_lst = seedlst
......@@ -184,11 +189,10 @@ class cami():
return preds
def make_evaluation(self):
print(self.result_gene_sets)
biodigest.setup.main(setup_type="api")
for result_set in self.result_gene_sets:
for result_set in self.result_module_sets:
validation_results = biodigest.single_validation.single_validation(
tar=set(self.result_gene_sets[result_set]),
tar=set(self.result_module_sets[result_set]),
tar_id='entrez',
mode='set-set',
distance='jaccard',
......@@ -198,7 +202,7 @@ class cami():
biodigest.single_validation.save_results(validation_results, f'{result_set}_{self.uid}', self.output_dir)
biodigest.evaluation.d_utils.plotting_utils.create_plots(results=validation_results,
mode='set-set',
tar=set(self.result_gene_sets[result_set]),
tar=set(self.result_module_sets[result_set]),
tar_id='entrez',
out_dir=self.output_dir,
prefix=f'{result_set}_{self.uid}')
......@@ -285,6 +289,10 @@ class cami():
tool_name_map = self.code2toolname
gene_name_map = self.ppi_vertex2gene
# remove seeds from result sets
for tool in result_sets:
result_sets[tool] -= set(self.seed_lst)
camis = {
'cami_v1': {'function': cami_v1.run_cami, 'params': {'consens_threshold': consens_threshold}},
'cami_v2_param1_tr': {'function': cami_v2.run_cami, 'params': {
......@@ -328,11 +336,16 @@ class cami():
}},
}
# create integer codes for cami_versions (needed for predicted_by vertex property)
for cami_method_name, cami_params in camis.items():
print("Running " + cami_method_name)
tool_code = max(list(tool_name_map.keys())) + 1
tool_name_map[tool_code] = cami_method_name
cami_vertices, putative_vertices, codes2tools = cami_params['function'](result_sets, ppi_graph, seed_list,
predicted_by, cami_scores,
tool_name_map,
tool_name_map, tool_code,
cami_params['params'])
# sort the resulting vertices according to their cami_score
......@@ -342,23 +355,33 @@ class cami():
# translate the resulting vertex() ids to the corresponding names in the ppi network
cami_genes = [self.ppi_vertex2gene[cami_vertex] for cami_vertex in cami_vlist]
if self.debug:
print(f'With the given seed genes: {seed_genes} \n' +
f'CAMI ({cami_method_name}) proposes the following genes to add to the Active Module (sorted by CAMI Score):')
for vertex in cami_vlist:
print(f'{gene_name_map[vertex]}\t{cami_scores[vertex]}\t{codes2tools[vertex]}')
# for visualization
else:
print(f'With the {len(seed_genes)} seed genes CAMI ({cami_method_name}) proposes {len(cami_vlist)} genes to add to the Active Module')
# for visualization with nvenn
self.result_gene_sets[cami_method_name] = cami_genes
if cami_method_name == 'cami_v1':
# for drugstone
self.cami_vertices = cami_vlist
# transform all vertex indices to their corresponding gene names in a result set
for tool in result_sets:
self.result_gene_sets[tool.name] = set([gene_name_map[vertex] for vertex in result_sets[tool]])
# add seeds to result sets for drugstone and digest
for tool in result_sets:
self.result_module_sets[tool.name] = set([gene_name_map[vertex] for vertex in result_sets[tool]]).union(self.seed_lst)
assert(self.code2toolname == tool_name_map)
# save the results in outputfiles
self.generate_output(cami_method_name, seed_genes, cami_vlist, cami_vertices, putative_vertices, cami_genes,
gene_name_map, codes2tools, result_sets, cami_scores)
gene_name_map, codes2tools, cami_scores)
def generate_output(self, cami_method, seed_genes, cami_vlist, cami_vertices, putative_vertices, cami_genes,
gene_name_map, codes2tools, result_sets, cami_scores):
gene_name_map, codes2tools, cami_scores):
# save all predictions by all tools
print('Saving the results...')
with open(f'{self.output_dir}/all_predictions_{self.uid}.tsv', 'w') as outputfile:
......@@ -401,9 +424,6 @@ class cami():
print(f'saved cami output in: {self.output_dir}/CAMI_output_{self.uid}.tsv')
print(f'saved the Consensus Active Module by CAMI in: {self.output_dir}/CAMI_nodes_{cami_method}_{self.uid}.txt')
# transform all vertex indices to their corresponding gene names in a result set
for tool in result_sets:
self.result_gene_sets[tool.name] = set([gene_name_map[vertex] for vertex in result_sets[tool]])
# save predictions by the other tools
for tool in self.result_gene_sets:
......@@ -413,9 +433,6 @@ class cami():
outputfile.write(f'{gene}\n')
print(f'saved {tool} output in: {self.output_dir}/{tool}_output_{self.uid}.tsv')
# for drugstone
self.cami_vertices = cami_vlist
# return values
consensus = {}
consensus['module'] = whole_module
......@@ -456,7 +473,7 @@ class cami():
def use_drugstone(self):
symbol = self.ppi_graph.vertex_properties["symbol"]
cami_module = self.cami_vertices + self.seed_lst
cami_module = self.cami_module
cami_symbols = [symbol[vertex] for vertex in cami_module]
cami_symbols.append
cami_symbol_edges = []
......
import sys, os
def run_cami(result_sets, ppi_graph, seed_lst, predicted_by, cami_scores, code2toolname, params):
def run_cami(result_sets, ppi_graph, seed_lst, predicted_by, cami_scores, code2toolname, tool_code, params):
consens_threshold = params['consens_threshold']
# calculate gene weights
# set of all result genes
......@@ -10,8 +10,8 @@ def run_cami(result_sets, ppi_graph, seed_lst, predicted_by, cami_scores, code2t
# parse every result set of each tool
for tool in result_sets:
print(f'{tool.name}: {tool.weight}')
result_sets[tool] -= set(seed_lst)
#print(f'{tool.name}: {tool.weight}')
# TODO: Should we keep the seeds in the result sets?
# every time a tool predicted a gene add 1 * the tool weight to its weight and add it to the result genes
for vertex in result_sets[tool]:
......@@ -22,7 +22,7 @@ def run_cami(result_sets, ppi_graph, seed_lst, predicted_by, cami_scores, code2t
vertex] >= consens_threshold: # if a vertex was predicted twice (or once if there is only 1 tool used) add it to the cami set
putative_vertices.remove(vertex)
cami_vertices.add(vertex)
predicted_by[vertex][0] = 1
predicted_by[vertex][tool_code] = 1
# TODO: Find alternate ways to calculate CAMI scores => The heavy weights should get +0.5 too?
# TODO: Try to rerun cami with varying input seeds?
......@@ -36,7 +36,7 @@ def run_cami(result_sets, ppi_graph, seed_lst, predicted_by, cami_scores, code2t
for vertex in putative_vertices:
if vertex in neighbors: # if a vertex is in the neighborhood of the heavy vertices increase the cami_score
cami_vertices.add(vertex)
predicted_by[vertex][0] = 1
predicted_by[vertex][tool_code] = 1
cami_scores[vertex] += 0.5
......@@ -45,4 +45,4 @@ def run_cami(result_sets, ppi_graph, seed_lst, predicted_by, cami_scores, code2t
codes2tools = {vertex: [code2toolname[idx] for idx, code in enumerate(predicted_by[vertex]) if code == 1] for
vertex in ppi_graph.vertices()}
return cami_vertices.union(set(seed_lst)), putative_vertices, codes2tools
\ No newline at end of file
return cami_vertices, putative_vertices, codes2tools
\ No newline at end of file
......@@ -7,7 +7,7 @@ from utils.networks import trustrank, betweenness, must
# This uses a trustrank algorithm to rank all putative nodes starting from the seeds and only accepts the top 0.X entries
# TODO maybe find a smart way to cutoff automatically?
def run_cami(result_sets, ppi_graph, seed_lst, predicted_by, cami_scores, code2toolname, params):
def run_cami(result_sets, ppi_graph, seed_lst, predicted_by, cami_scores, code2toolname, tool_code, params):
damping_factor = params['damping_factor']
hub_penalty = params['hub_penalty']
confidence_level = params.get('confidence_level',0.5)
......@@ -23,7 +23,6 @@ def run_cami(result_sets, ppi_graph, seed_lst, predicted_by, cami_scores, code2t
# parse every result set of each tool
counts = defaultdict(lambda: 0)
for tool in result_sets:
result_sets[tool] -= set(seed_lst)
for vertex in result_sets[tool]:
putative_vertices.add(vertex)
counts[vertex] = counts[vertex] + tool.weight
......@@ -49,8 +48,9 @@ def run_cami(result_sets, ppi_graph, seed_lst, predicted_by, cami_scores, code2t
for v in putative_vertices:
if scores.a[int(v)] >= threshold and scores.a[int(v)] > 0:
cami_vertices.add(v)
predicted_by[v][tool_code] = 1
# translate tool code to string
codes2tools = {vertex: [code2toolname[idx] for idx, code in enumerate(predicted_by[vertex]) if code == 1] for
vertex in ppi_graph.vertices()}
return cami_vertices.union(set(seed_lst)), putative_vertices, codes2tools
return cami_vertices, putative_vertices, codes2tools
......@@ -5,10 +5,10 @@ import graph_tool as gt
# This uses a trustrank algorithm to rank all putative nodes starting from the seeds and only accepts the top 0.X entries
# TODO maybe find a smart way to cutoff automatically?
def run_cami(result_sets, ppi_graph, seed_lst, predicted_by, cami_scores, code2toolname, params):
def run_cami(result_sets, ppi_graph, seed_lst, predicted_by, cami_scores, code2toolname, tool_code, params):
damping_factor = params['damping_factor']
hub_penalty = params['hub_penalty']
confidence_levelentage = params['confidence_level']
confidence_level = params['confidence_level']
weighted = 'weighted' in params and params['weighted']
ranking_method = params['ranking'] if 'ranking' in params else 'trustrank'
trees = params.get('trees',5)
......@@ -22,7 +22,6 @@ def run_cami(result_sets, ppi_graph, seed_lst, predicted_by, cami_scores, code2t
# parse every result set of each tool
counts = defaultdict(lambda: 0)
for tool in result_sets:
result_sets[tool] -= set(seed_lst)
for vertex in result_sets[tool]:
putative_vertices.add(vertex)
counts[vertex] = counts[vertex] + tool.weight
......@@ -55,12 +54,13 @@ def run_cami(result_sets, ppi_graph, seed_lst, predicted_by, cami_scores, code2t
pass
putative_scores = list(putative_score_map.values())
putative_scores.sort()
threshold = putative_scores[int(len(putative_vertices) * (1 - confidence_levelentage))]
threshold = putative_scores[int(len(putative_vertices) * (1 - confidence_level))]
for v in putative_vertices:
if putative_score_map[v] >= threshold and putative_score_map[v] > 0:
cami_vertices.add(v)
predicted_by[v][tool_code] = 1
# translate tool code to string
codes2tools = {vertex: [code2toolname[idx] for idx, code in enumerate(predicted_by[vertex]) if code == 1] for
vertex in ppi_graph.vertices()}
return cami_vertices.union(set(seed_lst)), putative_vertices, codes2tools
return cami_vertices, putative_vertices, codes2tools
......@@ -33,9 +33,8 @@ def csv2graph(inputfile,
unseen_vertices -= 1
if unseen_vertices == 0:
break
g.vertex_properties["betweenness"], g.edge_properties["betweenness"] = graph_tool.centrality.betweenness(g)
g.vertex_properties["cami_score"] = g.new_vertex_property("float", val=0.0)
values = (20) * [-1]
values = (50) * [-1]
g.vertex_properties["predicted_by"] = g.new_vertex_property("vector<int16_t>", val=values)
return g
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment