From dc07f19540c31b7269de93acc57e0a7562667634 Mon Sep 17 00:00:00 2001 From: AndiMajore <andi.majore@googlemail.com> Date: Thu, 21 Jul 2022 21:01:44 +0200 Subject: [PATCH] changed to new node schema --- drugstone/serializers.py | 35 ++++++++++ drugstone/tasks.py | 2 +- drugstone/util/query_db.py | 22 ++++--- drugstone/views.py | 127 +++++++++++++++++++++++-------------- 4 files changed, 129 insertions(+), 57 deletions(-) diff --git a/drugstone/serializers.py b/drugstone/serializers.py index 666943d..1e92af8 100755 --- a/drugstone/serializers.py +++ b/drugstone/serializers.py @@ -17,6 +17,41 @@ class PPIDatasetSerializer(serializers.ModelSerializer): model = models.PPIDataset fields = '__all__' +class ProteinNodeSerializer(serializers.ModelSerializer): + drugstone_id = serializers.SerializerMethodField() + uniprot_ac = serializers.SerializerMethodField() + symbol = serializers.SerializerMethodField() + ensg = serializers.SerializerMethodField() + entrez = serializers.SerializerMethodField() + + def get_drugstone_id(self, obj): + return [f'p{obj.id}'] + + def get_uniprot_ac(self, obj): + return [obj.uniprot_code] + + def get_symbol(self, obj): + return [obj.gene] + + def get_entrez(self,obj): + return [obj.entrez] + + def get_ensg(self, obj) -> str: + """Since ENSG has a many to one relationship to the Protein table, + return a list of all matching ensg names. + + Args: + obj (Protein): Protein object + + Returns: + str: list of all matching ENSG numbers + """ + return [x.name for x in obj.ensg.all()] + + class Meta: + model = Protein + fields = ['drugstone_id', 'uniprot_ac', 'symbol', 'protein_name', 'entrez', 'ensg'] + class ProteinSerializer(serializers.ModelSerializer): drugstone_id = serializers.SerializerMethodField() diff --git a/drugstone/tasks.py b/drugstone/tasks.py index 97cc9f3..fd09e65 100644 --- a/drugstone/tasks.py +++ b/drugstone/tasks.py @@ -19,7 +19,7 @@ def task_update_db_from_nedrex(): if n > 0: logger.info('Recreating networks...') proc = subprocess.Popen(['python3', '/usr/src/drugstone/manage.py', 'make_graphs']) - out,err = proc.communicate() + out, err = proc.communicate() print(out) print(err) logger.info('Done.') diff --git a/drugstone/util/query_db.py b/drugstone/util/query_db.py index 89fd8be..ba26adb 100644 --- a/drugstone/util/query_db.py +++ b/drugstone/util/query_db.py @@ -1,3 +1,4 @@ +import copy from collections import defaultdict from typing import List, Tuple, Set, OrderedDict from functools import reduce @@ -22,7 +23,6 @@ def query_proteins_by_identifier(node_ids: Set[str], identifier: str) -> Tuple[L Returns list of serialized protein entries for all matched IDs Returns name of backend attribute of Protein table """ - # query protein table if identifier == 'symbol': protein_attribute = 'symbol' @@ -32,9 +32,9 @@ def query_proteins_by_identifier(node_ids: Set[str], identifier: str) -> Tuple[L q_list = map(lambda n: Q(uniprot_code__iexact=n), node_ids) elif identifier == 'ensg': protein_attribute = 'ensg' - node_ids = map(lambda n: n.protein_id, EnsemblGene.objects.filter( + dr_ids = map(lambda n: n.protein_id, EnsemblGene.objects.filter( reduce(lambda a, b: a | b, map(lambda n: Q(name__iexact=n), list(node_ids))))) - q_list = map(lambda n: Q(id=n), node_ids) + q_list = map(lambda n: Q(id=n), dr_ids) elif identifier == 'entrez': protein_attribute = 'entrez' q_list = map(lambda n: Q(entrez=n), node_ids) @@ -45,11 +45,17 @@ def query_proteins_by_identifier(node_ids: Set[str], identifier: str) -> Tuple[L node_objects = Protein.objects.filter(q_list) nodes = list() - node_map = defaultdict(list) - - for node in ProteinSerializer(many=True).to_representation(node_objects): - node_map[node.get(protein_attribute)].append(node) + if identifier == 'ensg': + for node in ProteinSerializer(many=True).to_representation(node_objects): + for ensembl_id in node.get(protein_attribute): + if ensembl_id.upper() in node_ids: + node = copy.copy(node) + node[identifier] = ensembl_id + node_map[ensembl_id].append(node) + else: + for node in ProteinSerializer(many=True).to_representation(node_objects): + node_map[node.get(protein_attribute)].append(node) for node_id, entries in node_map.items(): nodes.append(aggregate_nodes(entries)) @@ -60,7 +66,7 @@ def aggregate_nodes(nodes: List[OrderedDict]): node = defaultdict(set) for n in nodes: for key, value in n.items(): - if isinstance(value,list): + if isinstance(value, list): for e in value: node[key].add(e) else: diff --git a/drugstone/views.py b/drugstone/views.py index 038fa88..d923139 100755 --- a/drugstone/views.py +++ b/drugstone/views.py @@ -4,6 +4,8 @@ import random import string import time import uuid +from collections import defaultdict + import pandas as pd from typing import Tuple @@ -58,12 +60,12 @@ def get_pdis_ds(source, licenced): def get_drdis_ds(source, licenced): try: - ds = models.PDisDataset.objects.filter(name__iexact=source, licenced=licenced).last() + ds = models.DrDiDataset.objects.filter(name__iexact=source, licenced=licenced).last() ds.id return ds except: if licenced: - return get_pdis_ds(source, False) + return get_drdis_ds(source, False) return None @@ -180,6 +182,7 @@ def map_nodes(request) -> Response: # nodes_mapped_dict = {node_id: node for node in nodes_mapped for node_id in node[id_key]} # else: nodes_mapped_dict = {node[id_key][0]: node for node in nodes_mapped} + # merge fetched data with given data to avoid data loss for node in nodes: node['drugstoneType'] = 'other' @@ -257,74 +260,49 @@ def result_view(request) -> Response: if not node_attributes: node_attributes = {} result['node_attributes'] = node_attributes + proteins = [] drugs = [] network = result['network'] - node_types = node_attributes.get('node_types') - if not node_types: - node_types = {} - node_attributes['node_types'] = node_types - is_seed = node_attributes.get('is_seed') - if not is_seed: - is_seed = {} - node_attributes['is_seed'] = is_seed + node_types = {} + node_attributes['node_types'] = node_types + is_seed = {} + node_attributes['is_seed'] = is_seed scores = node_attributes.get('scores', {}) node_details = {} + protein_id_map = defaultdict(set) node_attributes['details'] = node_details parameters = json.loads(task.parameters) seeds = parameters['seeds'] nodes = network['nodes'] - # edges = network['edges'] - for node_id in nodes: - is_seed[node_id] = node_id in seeds - node_type = node_types.get(node_id).lower() - pvd_entity = None - details_s = None - if node_type == 'protein': - pvd_entity = Protein.objects.get(id=int(node_id[1:])) - elif node_type == 'drug': - pvd_entity = Drug.objects.get(id=int(node_id[2:])) - - if not node_type or not pvd_entity: - continue - if node_type == 'protein': - details_s = ProteinSerializer().to_representation(pvd_entity) - elif node_type == 'drug': - details_s = DrugSerializer().to_representation(pvd_entity) - node_types[node_id] = node_type - - if scores.get(node_id) is not None: - details_s['score'] = scores.get(node_id, None) - node_details[node_id] = details_s - if node_type == 'protein': - proteins.append(details_s) - elif node_type == 'drug': - drugs.append(details_s) + parameters = task_parameters(task) # attach input parameters to output result['parameters'] = parameters + identifier_nodes = set() + identifier = parameters['config']['identifier'] - # TODO move the merging to "scores to result" # merge input network with result network for node in parameters['input_network']['nodes']: # if node was already mapped, add user defined values to result of analysis - if node_name_attribute in node: - if node[node_name_attribute] in node_details: + if identifier in identifier_nodes: + node_name = node[identifier][0] + if node_name in node_details: # update the node to not lose user input attributes - node_details[node[node_name_attribute]].update(node) + node_details[node_name].update(node) # skip adding node if node already exists in analysis output to avoid duplicates else: # node does not exist in analysis output yet, was added by user but not used as seed - node_details[node[node_name_attribute]] = node + node_details[node_name] = node # append mapped input node to analysis result - nodes.append(node[node_name_attribute]) + nodes.append(node_name) # manually add node to node types - result['node_attributes']['node_types'][node[node_name_attribute]] = 'protein' + result['node_attributes']['node_types'][node_name] = 'protein' else: # node is custom node from user, not mapped to drugstone but will be displayed with all custom attributes node_id = node['id'] - nodes.append(node_id) + identifier_nodes.add(node_id) node_details[node_id] = node is_seed[node_id] = False # append custom node to analysis result later on @@ -332,15 +310,62 @@ def result_view(request) -> Response: result['node_attributes']['node_types'][node_id] = 'custom' # extend the analysis network by the input netword nodes # map edge endpoints to database proteins if possible and add edges to analysis network - identifier = parameters['config']['identifier'] + + # mapping all new protein and drug nodes by drugstoneIDs + adding scores + for node_id in nodes: + + if node_id[0] == 'p': + node_data = ProteinNodeSerializer().to_representation(Protein.objects.get(id=int(node_id[1:]))) + # proteins.append(node_data) + node_ident = node_data[identifier][0] + # node_data[identifier] = [node_ident] + protein_id_map[node_ident].add(node_id) + identifier_nodes.add(node_ident) + is_seed[node_ident] = node_id in seeds or (is_seed[node_ident] if node_ident in is_seed else False) + node_types[node_ident] = 'protein' + score = scores.get(node_id, None) + if node_ident in node_details: + data = node_details[node_ident] + data['entrez'].extend(node_data['entrez']) + data['ensg'].extend(node_data['ensg']) + data['symbol'].extend(node_data['symbol']) + data['uniprot_ac'].extend(node_data['uniprot_ac']) + if score: + if 'score' in data: + data['score'].append(score) + else: + data['score'] = [score] if score else [] + else: + node_data['score'] = [score] if score else [] + node_data['drugstoneType'] = 'protein' + node_data['id'] = node_ident + node_data['label'] = node_ident + node_details[node_ident] = node_data + + elif node_id[:2] == 'dr': + node_data = DrugSerializer().to_representation(Drug.objects.get(id=int(node_id[2:]))) + drugs.append(node_data) + if node_id in scores: + node_data['score'] = scores.get(node_id, None) + node_types[node_id] = 'drug' + node_details[node_id] = node_data + else: + continue + for node_id, detail in node_details.items(): + detail['symbol'] = list(set(detail['symbol'])) + detail['entrez'] = list(set(detail['entrez'])) + detail['uniprot_ac'] = list(set(detail['uniprot_ac'])) + detail['ensg'] = list(set(detail['ensg'])) + edges = parameters['input_network']['edges'] edge_endpoint_ids = set() + # TODO check for custom edges when working again for edge in edges: edge_endpoint_ids.add(edge['from']) edge_endpoint_ids.add(edge['to']) - # query protein table nodes_mapped, id_key = query_proteins_by_identifier(edge_endpoint_ids, identifier) + # change data structure to dict in order to be quicker when merging nodes_mapped_dict = {node[id_key]: node for node in nodes_mapped} for edge in edges: @@ -350,8 +375,10 @@ def result_view(request) -> Response: edge['to'] = nodes_mapped_dict[edge['to']][node_name_attribute] if edge['to'] in nodes_mapped_dict else edge[ 'to'] if 'autofill_edges' in parameters['config'] and parameters['config']['autofill_edges']: - proteins = set(map(lambda n: n[node_name_attribute][1:], - filter(lambda n: node_name_attribute in n, parameters['input_network']['nodes']))) + proteins = {node_name[1:] for nodes in map(lambda n: n[node_name_attribute], + filter(lambda n: node_name_attribute in n, + parameters['input_network']['nodes'])) for node_name in nodes} + dataset = DEFAULTS['ppi'] if 'interaction_protein_protein' not in parameters['config'] else \ parameters['config'][ 'interaction_protein_protein'] @@ -362,6 +389,9 @@ def result_view(request) -> Response: map(lambda n: {"from": f'p{n.from_protein_id}', "to": f'p{n.to_protein_id}'}, interaction_objects)) edges.extend(auto_edges) result['network']['edges'].extend(edges) + result['network']['nodes'] = list(identifier_nodes) + if 'scores' in result['node_attributes']: + del result['node_attributes']['scores'] if not view: return Response(result) @@ -375,6 +405,7 @@ def result_view(request) -> Response: 'gene': i['symbol'], 'name': i['protein_name'], 'ensg': i['ensg'], + 'entrez': i['entrez'], 'seed': is_seed[i[node_name_attribute]], } if i.get('score'): -- GitLab