Cleaned up the code. Handled saving of the output files.

026eda27 · Mia_Le · 33aaf45c · 026eda27 · 026eda27 · 026eda27
Commit 026eda27 authored Feb 5, 2022 by Mia_Le
--- a/cami/DiamondWrapper.py
+++ b/cami/DiamondWrapper.py
 from AlgorithmWrapper import AlgorithmWrapper
-from graph_tool import Graph 
-import subprocess, os, preprocess
-import tempfile
+import subprocess, os

 class DiamondWrapper(AlgorithmWrapper):
    def __init__(self):
        super().__init__()
        self.name = 'DIAMOnD'
+        self.code = 1

    def run_algorithm(self, inputparams):
        """Run DIAMOnD algorithm

--- a/cami/DominoWrapper.py
+++ b/cami/DominoWrapper.py
@@ -7,6 +7,7 @@ class DominoWrapper(AlgorithmWrapper):
    def __init__(self):
        super().__init__()
        self.name = 'DOMINO'
+        self.code = 2

    def run_algorithm(self, inputparams):
        """Run Domino algorithm

--- a/cami/RobustWrapper.py
+++ b/cami/RobustWrapper.py
@@ -16,6 +16,7 @@ class RobustWrapper(AlgorithmWrapper):
    def __init__(self):
        super().__init__()
        self.name = 'ROBUST'
+        self.code = 3

    def run_algorithm(self, inputparams):
        # -----------------------------------------------------

--- a/cami/cami_suite.py
+++ b/cami/cami_suite.py
@@ -38,9 +38,10 @@ class cami():
        self.output_dir = output_dir
        self.tmp_dir = tmp_dir
        self.uid = uid
-        self.visualize = False
        self.nof_tools = len(tool_wrappers)
-
+        self.result_gene_sets = {} #contains the genes predicted by the tools (not the indices)
+        self.code2toolname = {tool.code:tool.name for tool in self.tool_wrappers}
+        self.code2toolname[0] = 'CAMI'
    def run_tool(self, tool):
        tool.set_id(self.uid)
        # TODO: Rethink placement of creation of the temporary directory?
@@ -49,7 +50,7 @@ class cami():
        inputparams = tool.prepare_input()
        print(f'running {tool.name}...')
        preds = set(tool.run_algorithm(inputparams))
-        print(f'{tool.name} predicted {len(preds)} active genes (seed genes not excluded):')
+        print(f'{tool.name} predicted {len(preds)} active vertices (seeds not excluded):')
        print(preds)
        return preds

@@ -57,13 +58,13 @@ class cami():
        print('Evaluation not implemented yet.')

    def create_consensus(self):
-        pred_sets = {}

        print(f'creating result sets of all {self.nof_tools} tools...')
-
+        pred_sets = {}
        for tool in self.tool_wrappers:
            preds = self.run_tool(tool)
            pred_sets[tool] = preds #- seed_set
+        print(pred_sets)

        assert self.nof_tools == len(pred_sets), 'Number of used tools does not match with number of result sets'
        
@@ -72,85 +73,119 @@ class cami():

        # calculate gene weights
        # set of all result genes 
-        gene_weights = self.ppi_graph.vertex_properties["weight"]
+        cami_scores = self.ppi_graph.vertex_properties["cami_score"]
+        predicted_by = self.ppi_graph.vertex_properties["predicted_by"]
        cami_vertices = set()
        putative_vertices = set()
-        
+        consens_threshold = min(self.nof_tools, 2)
        for tool in result_sets:
            result_sets[tool] -= set(self.seed_lst)
+            # TODO: Should we keep the seeds in the result sets?
            # everytime a tool predicted a gene add 1 * the tool weight to its weight and add it to the result genes
            for vertex in result_sets[tool]:
-                gene_weights[vertex] += 1.0 * tool.weight
+                predicted_by[vertex][tool.code] = 1 #TODO: tool or tool.name?
+                cami_scores[vertex] += 1.0 * tool.weight
                putative_vertices.add(vertex)
-                if gene_weights[vertex] > 2: # if a vertex was predicted twice add it to the cami set
+                if cami_scores[vertex] >= consens_threshold: # if a vertex was predicted twice (or once if there is only 1 tool used) add it to the cami set
                    putative_vertices.remove(vertex)
                    cami_vertices.add(vertex)
+                    predicted_by[vertex][0] = 1
                                    
        # TODO: add neighbors of the result genes if gene weight == 1
        # TODO: calculate CAMI scores
        # TODO: Try to rerun cami with varying input seeds
-        # add a putative gene to the cami set when it is in the neighborhood of the existing cami genes
-        for vertex in cami_vertices.copy():
+        # add a putative gene to the cami set when it is in the neighborhood of the existing cami genes or the seed genes
+        heavy_vertices = cami_vertices.copy()
+        for seed in self.seed_lst:
+            heavy_vertices.add(seed)
+
+        for vertex in heavy_vertices:
            neighbors = vertex.all_neighbors()
            for vertex in putative_vertices:
                if vertex in neighbors:
                    cami_vertices.add(vertex)
-                    gene_weights[vertex] += 0.5
+                    predicted_by[vertex][0] = 1
+                    cami_scores[vertex] += 0.5
+        # sort the resulting vertices according to their cami_score
+        cami_vlist = sorted(cami_vertices, key=lambda v:cami_scores[v], reverse=True)
+
+        cami_genes = [self.ppi_vertex2gene[cami_vertex] for cami_vertex in cami_vlist]
+        seed_genes = [self.ppi_vertex2gene[seed_vertex] for seed_vertex in self.seed_lst]
+        
+        print(f'With the given seed genes: {seed_genes} CAMI proposes the following genes to add to the Active Module (sorted by CAMI Score): {cami_genes}')

-        cami_genes = [self.ppi_vertex2gene[cami_vertex] for cami_vertex in cami_vertices]
-        cami_genes = sorted(cami_genes, key=lambda v:cami_genes[v])
+        # save the results in outputfiles
+        name = self.ppi_vertex2gene
        
+        with open(f'{self.output_dir}/all_predictions_{self.uid}.tsv', 'w') as outputfile:
+            outputfile.write(f'gene\tpredicted_by\tcami_score\tindex_in_graph\n')
+            all_vertices = cami_vertices.union(putative_vertices)
+            for vertex in all_vertices:
+                outputfile.write(f'{name[vertex]}\t{[self.code2toolname[idx] for idx,code in enumerate(predicted_by[vertex]) if code == 1]}\t{cami_scores[vertex]}\t{str(vertex)}\n')

-        print(f'With the given seed genes: {self.seed_lst} CAMI proposes the following genes to add to the Active Module (sorted by CAMI Score): {cami_result}')
+        with open(f'{self.output_dir}/CAMI_{self.uid}_output.tsv', 'w') as outputfile:
+            outputfile.write('gene\tindex_in_graph\tcami_score\n')
+            for vertex in cami_vlist:
+                outputfile.write(f'{name[vertex]}\t{str(vertex)}\t{cami_scores[vertex]}\n')

-        #make intersections
-        intersection_set = {}
+        # for visualization
+        # transform the vertex indices to their corresponding gene names
+        self.result_gene_sets['CAMI'] = cami_genes
        for tool in result_sets:
-            intersection_set[tool.name] = result_sets[tool]
-        intersection_set['Seeds'] = set(self.seed_lst)
-        tools = [tool for tool in intersection_set]
-        combis = list_combinations([True,False], len(tools))
-        result_table = {}
-        for combi in combis:
-            key = ''
-            value = set()
-            for toolid,choice in enumerate(combi):
-                if choice:
-                    tool = tools[toolid]
-                    key += tool + '_'
-                    if len(value) == 0:
-                        value = intersection_set[tool]
-                    else:
-                        value = value.intersection(intersection_set[tool])
-            if len(key) > 0 and len(value) > 0:
-                result_table[key[:-1]] = value          
-        # save intersections in tabular format
-        intersectionsfilename = f'cami_intersections_{self.uid}_output.txt'
-        intersectionsfile = f'{self.output_dir}/{intersectionsfilename}'
-        with open(intersectionsfile, 'w') as file:
-            file.write('Result sets and intersections (combined by _) of all used tools and the used seed genes.\n')
-            file.write('All seed genes were removed from the result sets of the tools\n')
-            for group in result_table:
-                file.write(f'{group}: {result_table[group]}\n')
-            print(f'Intersections and result sets of all tools were saved to {intersectionsfile}')
-    
-    def visualize(self, degradome_sets):
+            self.result_gene_sets[tool.name] = set([name[vertex] for vertex in result_sets[tool]])
+
+    def visualize(self):
        # visualize with degradome
-        if self.nof_tools < 5:
-            print('Visualizing results using Degradome...')
-            print(f'creating a separate set for the seeds...')
-            # degradome_sets['CAMI'] = set(result_genes)
-            degradome_sets['Seeds'] = set(self.seed_lst)
-            url = degradome.send_request(degradome_sets)
-            webbrowser.open(url)
-        elif nof_tools == 6:
-            print('Visualizing using Degradome...(seeds excluded from results)')
-            # degradome_sets = result_sets.copy()
+        if self.nof_tools < 7:
+            # print('Visualizing results using Degradome...')
+            # print(f'creating a separate set for the seeds...')
            # degradome_sets['CAMI'] = set(result_genes)
-            url = degradome.send_request(degradome_sets)
+            # degradome_sets['Seeds'] = set(self.seed_lst)
+            url = degradome.send_request(self.result_gene_sets)
            webbrowser.open(url)
+        # elif nof_tools == 6:
+        #     print('Visualizing using Degradome...(seeds excluded from results)')
+        #     # degradome_sets = result_sets.copy()
+        #     # degradome_sets['CAMI'] = set(result_genes)
+        #     url = degradome.send_request(degradome_sets)
+        #     webbrowser.open(url)
        else:
            print('Cannot use degradome to create venn diagrams of 6 or more tools')
        # for node in union:


+
+    # do we even need the intersections?
+    # def intersect(self)
+    #     #make intersections
+    #     intersection_set = {}
+    #     for tool in result_sets:
+    #         intersection_set[tool.name] = result_sets[tool]
+    #     intersection_set['Seeds'] = set(self.seed_lst)
+    #     tools = [tool for tool in intersection_set]
+    #     combis = list_combinations([True,False], len(tools))
+    #     result_table = {}
+    #     for combi in combis:
+    #         key = ''
+    #         value = set()
+    #         for toolid,choice in enumerate(combi):
+    #             if choice:
+    #                 tool = tools[toolid]
+    #                 key += tool + '_'
+    #                 if len(value) == 0:
+    #                     value = intersection_set[tool]
+    #                 else:
+    #                     value = value.intersection(intersection_set[tool])
+    #         if len(key) > 0 and len(value) > 0:
+    #             result_table[key[:-1]] = value          
+    #     # save intersections in tabular format
+    #     intersectionsfilename = f'cami_intersections_{self.uid}_output.txt'
+    #     intersectionsfile = f'{self.output_dir}/{intersectionsfilename}'
+    #     with open(intersectionsfile, 'w') as file:
+    #         file.write('Result sets and intersections (combined by _) of all used tools and the used seed genes.\n')
+    #         file.write('All seed genes were removed from the result sets of the tools\n')
+    #         for group in result_table:
+    #             file.write(f'{group}: {result_table[group]}\n')
+    #         print(f'Intersections and result sets of all tools were saved to {intersectionsfile}')
+    
+
--- a/cami/preprocess.py
+++ b/cami/preprocess.py
@@ -13,7 +13,8 @@ def csv2graph(inputfile, delimiter="\t"):
    """
    g = graph_tool.load_graph_from_csv(inputfile, skip_first=True,
        csv_options={'delimiter': '\t', 'quotechar': '"'})
-    g.vertex_properties["weight"] = g.new_vertex_property("float", val=0.0)
+    g.vertex_properties["cami_score"] = g.new_vertex_property("float", val=0.0)
+    g.vertex_properties["predicted_by"] = g.new_vertex_property("vector<int16_t>", val=[-1,-1,-1,-1,-1])
    return g

 def txt2lst(seed_file):