From 2692374c3fcaa4a23065dd1f9301076dc9d633a9 Mon Sep 17 00:00:00 2001 From: "Hartung, Michael" <michael.hartung@uni-hamburg.de> Date: Fri, 21 Apr 2023 19:16:51 +0200 Subject: [PATCH] map_nodes: remove empty node ids; compact_ids: fallback to input id if protein does not have id in target space Former-commit-id: 712b553579c8857e63d895df715677919b31cec2 [formerly 320590f9cd52ce892d309763aca62ee26b8305b1] Former-commit-id: b1ab54bc90f5c43527fcf914183c0cbd89350c36 --- drugstone/util/query_db.py | 27 ++++++++++++++++++++++----- drugstone/views.py | 7 +++++++ 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/drugstone/util/query_db.py b/drugstone/util/query_db.py index a589cf2..8ccd9e8 100644 --- a/drugstone/util/query_db.py +++ b/drugstone/util/query_db.py @@ -7,6 +7,16 @@ from drugstone.models import Protein, EnsemblGene from drugstone.serializers import ProteinSerializer +MAP_ID_SPACE_COMPACT_TO_DRUGSTONE = { + 'symbol:': 'symbol', + 'uniprot:': 'uniprot', + 'ensg:': 'ensg', + 'ncbigene:': 'entrez', + 'ensembl:': 'ensg', + 'entrez:': 'entrez' +} + + def query_proteins_by_identifier(node_ids: Set[str], identifier: str) -> Tuple[List[dict], str]: """Queries the django database Protein table given a list of identifiers (node_ids) and a identifier name (identifier). 
@@ -66,13 +76,13 @@ def query_proteins_by_identifier(node_ids: Set[str], identifier: str) -> Tuple[L def get_protein_ids(id_space, proteins): if (id_space == 'uniprot'): - return [p['uniprot'] for p in proteins] + return {p['uniprot'] for p in proteins} if (id_space == 'ensg' or id_space == 'ensembl'): - return [p['ensg'] for p in proteins] + return {p['ensg'] for p in proteins} if (id_space == 'symbol'): - return [p['symbol'] for p in proteins] + return {p['symbol'] for p in proteins} if (id_space == 'entrez' or id_space == 'ncbigene'): - return [p['entrez'] for p in proteins] + return {p['entrez'] for p in proteins} return set() @@ -136,7 +146,14 @@ def clean_proteins_from_compact_notation(node_ids: Set[str], identifier: str) -> continue q_list = reduce(lambda a, b: a | b, q_list) proteins = ProteinSerializer(many=True).to_representation(Protein.objects.filter(q_list)) - clean_ids = clean_ids.union(get_protein_ids(identifier, proteins)) + # if protein could not be mapped + clean_ids_temp = get_protein_ids(identifier, proteins) + if '' in clean_ids_temp: + clean_ids_temp.remove('') + # at least one protein could not be found in id space, use original id as placeholder + ids_placeholder = {p[MAP_ID_SPACE_COMPACT_TO_DRUGSTONE[id_space]] for p in proteins if p[identifier] == ''} + clean_ids_temp |= ids_placeholder + clean_ids |= clean_ids_temp return list(clean_ids) diff --git a/drugstone/views.py b/drugstone/views.py index 89081ab..366af9f 100755 --- a/drugstone/views.py +++ b/drugstone/views.py @@ -175,11 +175,18 @@ def map_nodes(request) -> Response: """ # load data from request nodes = request.data.get('nodes', '[]') + id_map = {} + nodes_clean = [] for node in nodes: + if not node['id']: + # skip empty node id '' + continue upper = node['id'].upper() id_map[upper] = node['id'] node['id'] = upper + nodes_clean.append(node) + nodes = nodes_clean identifier = request.data.get('identifier', '') # extract ids for filtering -- GitLab