map_nodes: remove empty node ids; compact_ids: fall back to input id if protein...

map_nodes: remove empty node ids; compact_ids: fall back to input id if protein does not have id in target space Former-commit-id: 712b553579c8857e63d895df715677919b31cec2 [formerly 320590f9cd52ce892d309763aca62ee26b8305b1] Former-commit-id: b1ab54bc90f5c43527fcf914183c0cbd89350c36

map_nodes: remove empty node ids; compact_ids: fall back to input id if protein...
2692374c · Hartung, Michael · 8c3c38ea · 2692374c · 2692374c
Commit 2692374c authored 2 years ago by Hartung, Michael
--- a/drugstone/util/query_db.py
+++ b/drugstone/util/query_db.py
@@ -7,6 +7,16 @@ from drugstone.models import Protein, EnsemblGene
 from drugstone.serializers import ProteinSerializer
+MAP_ID_SPACE_COMPACT_TO_DRUGSTONE = {
+    'symbol:': 'symbol',
+    'uniprot:': 'uniprot',
+    'ensg:': 'ensg',
+    'ncbigene:': 'entrez',
+    'ensembl:': 'ensg',
+    'entrez:': 'entrez'
+}
 def query_proteins_by_identifier(node_ids: Set[str], identifier: str) -> Tuple[List[dict], str]:
    """Queries the django database Protein table given a list of identifiers (node_ids) and a identifier name
    (identifier).
@@ -66,13 +76,13 @@ def query_proteins_by_identifier(node_ids: Set[str], identifier: str) -> Tuple[L
 def get_protein_ids(id_space, proteins):
    if (id_space == 'uniprot'):
-        return [p['uniprot'] for p in proteins]
+        return {p['uniprot'] for p in proteins}
    if (id_space == 'ensg' or id_space == 'ensembl'):
-        return [p['ensg'] for p in proteins]
+        return {p['ensg'] for p in proteins}
    if (id_space == 'symbol'):
-        return [p['symbol'] for p in proteins]
+        return {p['symbol'] for p in proteins}
    if (id_space == 'entrez' or id_space == 'ncbigene'):
-        return [p['entrez'] for p in proteins]
+        return {p['entrez'] for p in proteins}
    return set()
@@ -136,7 +146,14 @@ def clean_proteins_from_compact_notation(node_ids: Set[str], identifier: str) ->
            continue
        q_list = reduce(lambda a, b: a | b, q_list)
        proteins = ProteinSerializer(many=True).to_representation(Protein.objects.filter(q_list))
-        clean_ids = clean_ids.union(get_protein_ids(identifier, proteins))
+        # if protein could not be mapped
+        clean_ids_temp = get_protein_ids(identifier, proteins)
+        if '' in clean_ids_temp:
+            clean_ids_temp.remove('')
+            # at least one protein could not be found in id space, use original id as placeholder
+            ids_placeholder = {p[MAP_ID_SPACE_COMPACT_TO_DRUGSTONE[id_space]] for p in proteins if p[identifier] == ''}
+            clean_ids_temp |= ids_placeholder
+        clean_ids |= clean_ids_temp
    return list(clean_ids)

--- a/drugstone/views.py
+++ b/drugstone/views.py
@@ -175,11 +175,18 @@ def map_nodes(request) -> Response:
    """
    # load data from request
    nodes = request.data.get('nodes', '[]')
    id_map = {}
+    nodes_clean = []
    for node in nodes:
+        if not node['id']:
+            # skip empty node id ''
+            continue
        upper = node['id'].upper()
        id_map[upper] = node['id']
        node['id'] = upper
+        nodes_clean.append(node)
+    nodes = nodes_clean
    identifier = request.data.get('identifier', '')
    # extract ids for filtering