From 2692374c3fcaa4a23065dd1f9301076dc9d633a9 Mon Sep 17 00:00:00 2001
From: "Hartung, Michael" <michael.hartung@uni-hamburg.de>
Date: Fri, 21 Apr 2023 19:16:51 +0200
Subject: [PATCH] map_nodes: remove empty node ids; compact_ids: fallback to
 input id if protein does not have id in target space

Former-commit-id: 712b553579c8857e63d895df715677919b31cec2 [formerly 320590f9cd52ce892d309763aca62ee26b8305b1]
Former-commit-id: b1ab54bc90f5c43527fcf914183c0cbd89350c36
---
 drugstone/util/query_db.py | 27 ++++++++++++++++++++++-----
 drugstone/views.py         |  7 +++++++
 2 files changed, 29 insertions(+), 5 deletions(-)

diff --git a/drugstone/util/query_db.py b/drugstone/util/query_db.py
index a589cf2..8ccd9e8 100644
--- a/drugstone/util/query_db.py
+++ b/drugstone/util/query_db.py
@@ -7,6 +7,16 @@ from drugstone.models import Protein, EnsemblGene
 from drugstone.serializers import ProteinSerializer
 
 
+MAP_ID_SPACE_COMPACT_TO_DRUGSTONE = {
+    'symbol:': 'symbol',
+    'uniprot:': 'uniprot',
+    'ensg:': 'ensg',
+    'ncbigene:': 'entrez',
+    'ensembl:': 'ensg',
+    'entrez:': 'entrez'
+}
+
+
 def query_proteins_by_identifier(node_ids: Set[str], identifier: str) -> Tuple[List[dict], str]:
     """Queries the django database Protein table given a list of identifiers (node_ids) and a identifier name
     (identifier).
@@ -66,13 +76,13 @@ def query_proteins_by_identifier(node_ids: Set[str], identifier: str) -> Tuple[L
 
 def get_protein_ids(id_space, proteins):
     if (id_space == 'uniprot'):
-        return [p['uniprot'] for p in proteins]
+        return {p['uniprot'] for p in proteins}
     if (id_space == 'ensg' or id_space == 'ensembl'):
-        return [p['ensg'] for p in proteins]
+        return {p['ensg'] for p in proteins}
     if (id_space == 'symbol'):
-        return [p['symbol'] for p in proteins]
+        return {p['symbol'] for p in proteins}
     if (id_space == 'entrez' or id_space == 'ncbigene'):
-        return [p['entrez'] for p in proteins]
+        return {p['entrez'] for p in proteins}
     return set()
 
 
@@ -136,7 +146,14 @@ def clean_proteins_from_compact_notation(node_ids: Set[str], identifier: str) ->
             continue
         q_list = reduce(lambda a, b: a | b, q_list)
         proteins = ProteinSerializer(many=True).to_representation(Protein.objects.filter(q_list))
-        clean_ids = clean_ids.union(get_protein_ids(identifier, proteins))
+        # if protein could not be mapped
+        clean_ids_temp = get_protein_ids(identifier, proteins)
+        if '' in clean_ids_temp:
+            clean_ids_temp.remove('')
+            # at least one protein could not be found in id space, use original id as placeholder
+            ids_placeholder = {p[MAP_ID_SPACE_COMPACT_TO_DRUGSTONE[id_space]] for p in proteins if p[identifier] == ''}
+            clean_ids_temp |= ids_placeholder
+        clean_ids |= clean_ids_temp
 
     return list(clean_ids)
 
diff --git a/drugstone/views.py b/drugstone/views.py
index 89081ab..366af9f 100755
--- a/drugstone/views.py
+++ b/drugstone/views.py
@@ -175,11 +175,18 @@ def map_nodes(request) -> Response:
     """
     # load data from request
     nodes = request.data.get('nodes', '[]')
+
     id_map = {}
+    nodes_clean = []
     for node in nodes:
+        if not node['id']:
+            # skip empty node id ''
+            continue
         upper = node['id'].upper()
         id_map[upper] = node['id']
         node['id'] = upper
+        nodes_clean.append(node)
+    nodes = nodes_clean
 
     identifier = request.data.get('identifier', '')
     # extract ids for filtering
-- 
GitLab