From dc07f19540c31b7269de93acc57e0a7562667634 Mon Sep 17 00:00:00 2001
From: AndiMajore <andi.majore@googlemail.com>
Date: Thu, 21 Jul 2022 21:01:44 +0200
Subject: [PATCH] changed to new node schema

---
 drugstone/serializers.py   |  35 ++++++++++
 drugstone/tasks.py         |   2 +-
 drugstone/util/query_db.py |  22 ++++---
 drugstone/views.py         | 127 +++++++++++++++++++++++--------------
 4 files changed, 129 insertions(+), 57 deletions(-)

diff --git a/drugstone/serializers.py b/drugstone/serializers.py
index 666943d..1e92af8 100755
--- a/drugstone/serializers.py
+++ b/drugstone/serializers.py
@@ -17,6 +17,41 @@ class PPIDatasetSerializer(serializers.ModelSerializer):
         model = models.PPIDataset
         fields = '__all__'
 
+class ProteinNodeSerializer(serializers.ModelSerializer):
+    """Serialize a Protein so that every identifier field is a list.
+
+    ``protein_name`` has no getter here; ModelSerializer resolves it.
+    """
+    drugstone_id = serializers.SerializerMethodField()
+    uniprot_ac = serializers.SerializerMethodField()
+    symbol = serializers.SerializerMethodField()
+    ensg = serializers.SerializerMethodField()
+    entrez = serializers.SerializerMethodField()
+
+    def get_drugstone_id(self, obj):
+        return [f'p{obj.id}']
+
+    def get_uniprot_ac(self, obj):
+        return [obj.uniprot_code]
+
+    def get_symbol(self, obj):
+        return [obj.gene]
+
+    def get_entrez(self, obj):
+        return [obj.entrez]
+
+    def get_ensg(self, obj) -> list:
+        """Return the names of all ENSG entries linked to the protein.
+
+        ENSG has a many to one relationship to the Protein table, so a
+        single protein can yield several names (or none).
+        """
+        return [x.name for x in obj.ensg.all()]
+
+    class Meta:
+        model = Protein
+        fields = ['drugstone_id', 'uniprot_ac', 'symbol', 'protein_name', 'entrez', 'ensg']
+
 
 class ProteinSerializer(serializers.ModelSerializer):
     drugstone_id = serializers.SerializerMethodField()
diff --git a/drugstone/tasks.py b/drugstone/tasks.py
index 97cc9f3..fd09e65 100644
--- a/drugstone/tasks.py
+++ b/drugstone/tasks.py
@@ -19,7 +19,7 @@ def task_update_db_from_nedrex():
     if n > 0:
         logger.info('Recreating networks...')
         proc = subprocess.Popen(['python3', '/usr/src/drugstone/manage.py', 'make_graphs'])
-        out,err = proc.communicate()
+        out, err = proc.communicate()
         print(out)
         print(err)
     logger.info('Done.')
diff --git a/drugstone/util/query_db.py b/drugstone/util/query_db.py
index 89fd8be..ba26adb 100644
--- a/drugstone/util/query_db.py
+++ b/drugstone/util/query_db.py
@@ -1,3 +1,4 @@
+import copy
 from collections import defaultdict
 from typing import List, Tuple, Set, OrderedDict
 from functools import reduce
@@ -22,7 +23,6 @@ def query_proteins_by_identifier(node_ids: Set[str], identifier: str) -> Tuple[L
             Returns list of serialized protein entries for all matched IDs
             Returns name of backend attribute of Protein table
     """
-
     # query protein table
     if identifier == 'symbol':
         protein_attribute = 'symbol'
@@ -32,9 +32,9 @@ def query_proteins_by_identifier(node_ids: Set[str], identifier: str) -> Tuple[L
         q_list = map(lambda n: Q(uniprot_code__iexact=n), node_ids)
     elif identifier == 'ensg':
         protein_attribute = 'ensg'
-        node_ids = map(lambda n: n.protein_id, EnsemblGene.objects.filter(
+        dr_ids = map(lambda n: n.protein_id, EnsemblGene.objects.filter(
             reduce(lambda a, b: a | b, map(lambda n: Q(name__iexact=n), list(node_ids)))))
-        q_list = map(lambda n: Q(id=n), node_ids)
+        q_list = map(lambda n: Q(id=n), dr_ids)
     elif identifier == 'entrez':
         protein_attribute = 'entrez'
         q_list = map(lambda n: Q(entrez=n), node_ids)
@@ -45,11 +45,17 @@ def query_proteins_by_identifier(node_ids: Set[str], identifier: str) -> Tuple[L
     node_objects = Protein.objects.filter(q_list)
 
     nodes = list()
-
     node_map = defaultdict(list)
-
-    for node in ProteinSerializer(many=True).to_representation(node_objects):
-        node_map[node.get(protein_attribute)].append(node)
+    if identifier == 'ensg':
+        for node in ProteinSerializer(many=True).to_representation(node_objects):
+            for ensembl_id in node.get(protein_attribute):
+                if ensembl_id.upper() in node_ids:
+                    node = copy.copy(node)
+                    node[identifier] = ensembl_id
+                    node_map[ensembl_id].append(node)
+    else:
+        for node in ProteinSerializer(many=True).to_representation(node_objects):
+            node_map[node.get(protein_attribute)].append(node)
     for node_id, entries in node_map.items():
         nodes.append(aggregate_nodes(entries))
 
@@ -60,7 +66,7 @@ def aggregate_nodes(nodes: List[OrderedDict]):
     node = defaultdict(set)
     for n in nodes:
         for key, value in n.items():
-            if isinstance(value,list):
+            if isinstance(value, list):
                 for e in value:
                     node[key].add(e)
             else:
diff --git a/drugstone/views.py b/drugstone/views.py
index 038fa88..d923139 100755
--- a/drugstone/views.py
+++ b/drugstone/views.py
@@ -4,6 +4,8 @@ import random
 import string
 import time
 import uuid
+from collections import defaultdict
+
 import pandas as pd
 from typing import Tuple
 
@@ -58,12 +60,12 @@ def get_pdis_ds(source, licenced):
 
 def get_drdis_ds(source, licenced):
     try:
-        ds = models.PDisDataset.objects.filter(name__iexact=source, licenced=licenced).last()
+        ds = models.DrDiDataset.objects.filter(name__iexact=source, licenced=licenced).last()
         ds.id
         return ds
     except:
         if licenced:
-            return get_pdis_ds(source, False)
+            return get_drdis_ds(source, False)
         return None
 
 
@@ -180,6 +182,7 @@ def map_nodes(request) -> Response:
     #     nodes_mapped_dict = {node_id: node for node in nodes_mapped for node_id in node[id_key]}
     # else:
     nodes_mapped_dict = {node[id_key][0]: node for node in nodes_mapped}
+
     # merge fetched data with given data to avoid data loss
     for node in nodes:
         node['drugstoneType'] = 'other'
@@ -257,74 +260,49 @@ def result_view(request) -> Response:
     if not node_attributes:
         node_attributes = {}
         result['node_attributes'] = node_attributes
+
     proteins = []
     drugs = []
 
     network = result['network']
-    node_types = node_attributes.get('node_types')
-    if not node_types:
-        node_types = {}
-        node_attributes['node_types'] = node_types
-    is_seed = node_attributes.get('is_seed')
-    if not is_seed:
-        is_seed = {}
-        node_attributes['is_seed'] = is_seed
+    node_types = {}
+    node_attributes['node_types'] = node_types
+    is_seed = {}
+    node_attributes['is_seed'] = is_seed
     scores = node_attributes.get('scores', {})
     node_details = {}
+    protein_id_map = defaultdict(set)
     node_attributes['details'] = node_details
     parameters = json.loads(task.parameters)
     seeds = parameters['seeds']
     nodes = network['nodes']
-    # edges = network['edges']
-    for node_id in nodes:
-        is_seed[node_id] = node_id in seeds
-        node_type = node_types.get(node_id).lower()
-        pvd_entity = None
-        details_s = None
-        if node_type == 'protein':
-            pvd_entity = Protein.objects.get(id=int(node_id[1:]))
-        elif node_type == 'drug':
-            pvd_entity = Drug.objects.get(id=int(node_id[2:]))
-
-        if not node_type or not pvd_entity:
-            continue
-        if node_type == 'protein':
-            details_s = ProteinSerializer().to_representation(pvd_entity)
-        elif node_type == 'drug':
-            details_s = DrugSerializer().to_representation(pvd_entity)
-        node_types[node_id] = node_type
-
-        if scores.get(node_id) is not None:
-            details_s['score'] = scores.get(node_id, None)
-        node_details[node_id] = details_s
-        if node_type == 'protein':
-            proteins.append(details_s)
-        elif node_type == 'drug':
-            drugs.append(details_s)
+
     parameters = task_parameters(task)
     # attach input parameters to output
     result['parameters'] = parameters
+    identifier_nodes = set()
+    identifier = parameters['config']['identifier']
 
-    # TODO move the merging to "scores to result"
     # merge input network with result network
     for node in parameters['input_network']['nodes']:
         # if node was already mapped, add user defined values to result of analysis
-        if node_name_attribute in node:
-            if node[node_name_attribute] in node_details:
+        if identifier in identifier_nodes:
+            node_name = node[identifier][0]
+            if node_name in node_details:
                 # update the node to not lose user input attributes
-                node_details[node[node_name_attribute]].update(node)
+                node_details[node_name].update(node)
                 # skip adding node if node already exists in analysis output to avoid duplicates
             else:
                 # node does not exist in analysis output yet, was added by user but not used as seed
-                node_details[node[node_name_attribute]] = node
+                node_details[node_name] = node
                 # append mapped input node to analysis result
-                nodes.append(node[node_name_attribute])
+                nodes.append(node_name)
                 # manually add node to node types
-                result['node_attributes']['node_types'][node[node_name_attribute]] = 'protein'
+                result['node_attributes']['node_types'][node_name] = 'protein'
         else:
             # node is custom node from user, not mapped to drugstone but will be displayed with all custom attributes
             node_id = node['id']
-            nodes.append(node_id)
+            identifier_nodes.add(node_id)
             node_details[node_id] = node
             is_seed[node_id] = False
             # append custom node to analysis result later on
@@ -332,15 +310,62 @@ def result_view(request) -> Response:
             result['node_attributes']['node_types'][node_id] = 'custom'
     # extend the analysis network by the input netword nodes
     # map edge endpoints to database proteins if possible and add edges to analysis network
-    identifier = parameters['config']['identifier']
+
+    # mapping all new protein and drug nodes by drugstoneIDs + adding scores
+    for node_id in nodes:
+
+        if node_id[0] == 'p':
+            node_data = ProteinNodeSerializer().to_representation(Protein.objects.get(id=int(node_id[1:])))
+            # proteins.append(node_data)
+            node_ident = node_data[identifier][0]
+            # node_data[identifier] = [node_ident]
+            protein_id_map[node_ident].add(node_id)
+            identifier_nodes.add(node_ident)
+            is_seed[node_ident] = node_id in seeds or (is_seed[node_ident] if node_ident in is_seed else False)
+            node_types[node_ident] = 'protein'
+            score = scores.get(node_id, None)
+            if node_ident in node_details:
+                data = node_details[node_ident]
+                data['entrez'].extend(node_data['entrez'])
+                data['ensg'].extend(node_data['ensg'])
+                data['symbol'].extend(node_data['symbol'])
+                data['uniprot_ac'].extend(node_data['uniprot_ac'])
+                if score:
+                    if 'score' in data:
+                        data['score'].append(score)
+                    else:
+                        data['score'] = [score] if score else []
+            else:
+                node_data['score'] = [score] if score else []
+                node_data['drugstoneType'] = 'protein'
+                node_data['id'] = node_ident
+                node_data['label'] = node_ident
+                node_details[node_ident] = node_data
+
+        elif node_id[:2] == 'dr':
+            node_data = DrugSerializer().to_representation(Drug.objects.get(id=int(node_id[2:])))
+            drugs.append(node_data)
+            if node_id in scores:
+                node_data['score'] = scores.get(node_id, None)
+            node_types[node_id] = 'drug'
+            node_details[node_id] = node_data
+        else:
+            continue
+    for node_id, detail in node_details.items():
+        detail['symbol'] = list(set(detail['symbol']))
+        detail['entrez'] = list(set(detail['entrez']))
+        detail['uniprot_ac'] = list(set(detail['uniprot_ac']))
+        detail['ensg'] = list(set(detail['ensg']))
+
     edges = parameters['input_network']['edges']
     edge_endpoint_ids = set()
+    # TODO check for custom edges when working again
     for edge in edges:
         edge_endpoint_ids.add(edge['from'])
         edge_endpoint_ids.add(edge['to'])
 
-    # query protein table
     nodes_mapped, id_key = query_proteins_by_identifier(edge_endpoint_ids, identifier)
+
     # change data structure to dict in order to be quicker when merging
     nodes_mapped_dict = {node[id_key]: node for node in nodes_mapped}
     for edge in edges:
@@ -350,8 +375,10 @@ def result_view(request) -> Response:
         edge['to'] = nodes_mapped_dict[edge['to']][node_name_attribute] if edge['to'] in nodes_mapped_dict else edge[
             'to']
     if 'autofill_edges' in parameters['config'] and parameters['config']['autofill_edges']:
-        proteins = set(map(lambda n: n[node_name_attribute][1:],
-                           filter(lambda n: node_name_attribute in n, parameters['input_network']['nodes'])))
+        proteins = {node_name[1:] for nodes in map(lambda n: n[node_name_attribute],
+                                                   filter(lambda n: node_name_attribute in n,
+                                                          parameters['input_network']['nodes'])) for node_name in nodes}
+
         dataset = DEFAULTS['ppi'] if 'interaction_protein_protein' not in parameters['config'] else \
             parameters['config'][
                 'interaction_protein_protein']
@@ -362,6 +389,9 @@ def result_view(request) -> Response:
             map(lambda n: {"from": f'p{n.from_protein_id}', "to": f'p{n.to_protein_id}'}, interaction_objects))
         edges.extend(auto_edges)
     result['network']['edges'].extend(edges)
+    result['network']['nodes'] = list(identifier_nodes)
+    if 'scores' in result['node_attributes']:
+        del result['node_attributes']['scores']
 
     if not view:
         return Response(result)
@@ -375,6 +405,7 @@ def result_view(request) -> Response:
                         'gene': i['symbol'],
                         'name': i['protein_name'],
                         'ensg': i['ensg'],
+                        'entrez': i['entrez'],
                         'seed': is_seed[i[node_name_attribute]],
                     }
                     if i.get('score'):
-- 
GitLab