from collections import defaultdict

# Number of records requested per page when walking a NeDRex collection.
_PAGE_SIZE = 10000


def _local_id(domain_id):
    """Return the second dot-separated segment of *domain_id*.

    For an id of the form ``'<prefix>.<value>'`` this yields ``'<value>'``.
    Raises ``IndexError`` when *domain_id* contains no dot.
    """
    return domain_id.split('.')[1]


def _iter_collection(fetch_page, coll_name, callback):
    """Page through *coll_name* with *fetch_page* and feed every item to *callback*.

    *fetch_page* is called as ``fetch_page(coll_name, offset=..., limit=...)``;
    iteration stops at the first empty (falsy) page.
    """
    offset = 0
    while True:
        page = fetch_page(coll_name, offset=offset, limit=_PAGE_SIZE)
        if not page:
            return
        for item in page:
            callback(item)
        offset += _PAGE_SIZE


def iter_node_collection(coll_name, eval):
    """Call ``eval(node)`` for every node of the NeDRex node collection *coll_name*.

    The parameter is called ``eval`` for backward compatibility with keyword
    callers; it is an ordinary callback, not the builtin.
    """
    # Function-scope import: the module stays importable without the client.
    from python_nedrex.core import get_nodes

    _iter_collection(get_nodes, coll_name, eval)


def iter_edge_collection(coll_name, eval):
    """Call ``eval(edge)`` for every edge of the NeDRex edge collection *coll_name*."""
    from python_nedrex.core import get_edges

    _iter_collection(get_edges, coll_name, eval)


def identify_updates(new_list, old_list):
    """Split freshly imported objects into updates and creations.

    Both arguments are mappings ``key -> object``.  For a key present in both
    mappings whose objects compare unequal, the *old* object is refreshed in
    place through ``old.update(new)`` and reported as an update.  For a key
    that only exists in *new_list*, the new object is reported as a creation.

    Returns:
        ``(updates, creates)`` - two lists of objects (not keys), so both can
        be handed directly to ``bulk_update`` / ``bulk_create``.
    """
    updates = []
    creates = []
    for key, fresh in new_list.items():
        if key not in old_list:
            # Fix: the object itself is needed downstream, not its key.
            creates.append(fresh)
            continue
        known = old_list[key]
        if fresh != known:
            known.update(fresh)
            updates.append(known)
    return updates, creates


def format_list(l):
    """Render a non-empty sequence as its ``str()`` form without the outer brackets.

    ``['a', 'b']`` becomes ``"'a', 'b'"``; ``None`` and empty input give ``""``.
    """
    if l is None or len(l) == 0:
        return ""
    return str(l)[1:-1]


def _persist(manager, imported, cached, fields, update):
    """Write *imported* objects through *manager* and return the cache to keep.

    Args:
        manager: a Django model manager (``Model.objects``).
        imported: mapping ``key -> unsaved model instance`` from this import run.
        cached: mapping ``key -> model instance`` already known to the importer.
        fields: field names written by ``bulk_update`` for changed rows.
        update: when false, everything in *imported* is created and replaces
            the cache; when true, only the differences to *cached* are written.
    """
    if not update:
        manager.bulk_create(imported.values())
        return imported

    updates, creates = identify_updates(imported, cached)
    # bulk_update() requires the list of fields to write.
    manager.bulk_update(updates, fields)
    manager.bulk_create(creates)
    for key, obj in imported.items():
        cached.setdefault(key, obj)
    return cached


class nedrex_importer:
    """Import proteins, drugs and disorders from a NeDRex API into the Django models.

    The instance keeps per-entity caches (``proteins``, ``drugs``,
    ``disorders``) keyed by the same id the import uses: uniprot code, drug id
    and mondo id respectively.
    """

    # Class-level defaults are kept for backward compatibility with code that
    # reads them on the class; __init__ gives every instance its own caches.
    proteins = dict()
    entrez_to_uniprot = dict()
    gene_name_to_uniprot = defaultdict(set)
    disorders = dict()
    drugs = dict()

    def __init__(self, base_url):
        """Point the NeDRex client at *base_url* and register an API key."""
        import python_nedrex as nedrex
        from python_nedrex.core import get_api_key

        nedrex.config.set_url_base(base_url)
        nedrex.config.set_api_key(get_api_key(accept_eula=True))

        # Per-instance state, so two importers never share mutable caches.
        self.proteins = {}
        self.entrez_to_uniprot = {}
        self.gene_name_to_uniprot = defaultdict(set)
        self.disorders = {}
        self.drugs = {}

    def init_proteins(self):
        """Fill the protein caches from the database unless already populated."""
        if self.proteins:
            return
        from drugstone import models

        print("Generating protein maps...")
        for protein in models.Protein.objects.all():
            # Keyed by uniprot code, matching the keys import_proteins() uses
            # and the direction of the entrez_to_uniprot lookup table.
            self.proteins[protein.uniprot_code] = protein
            self.entrez_to_uniprot[protein.entrez] = protein.uniprot_code
            self.gene_name_to_uniprot[protein.gene].add(protein.uniprot_code)

    def init_drugs(self):
        """Fill the drug cache (keyed by ``drug_id``) from the database if empty."""
        if self.drugs:
            return
        from drugstone import models

        print("Generating drug map...")
        for drug in models.Drug.objects.all():
            self.drugs[drug.drug_id] = drug

    def init_disorders(self):
        """Fill the disorder cache (keyed by ``mondo_id``) from the database if empty."""
        if self.disorders:
            return
        from drugstone import models

        print("Generating disorder map...")
        for disorder in models.Disorder.objects.all():
            self.disorders[disorder.mondo_id] = disorder

    def import_proteins(self, update: bool):
        """Import proteins, their encoding genes and gene synonyms.

        Args:
            update: when true, compare against the proteins already stored and
                write only new/changed rows; otherwise create every imported
                protein.

        Returns:
            Number of proteins held in the cache afterwards.
        """
        from drugstone import models

        proteins = {}
        gene_to_prots = defaultdict(set)

        if update:
            self.init_proteins()

        def add_protein(node):
            uniprot = _local_id(node['primaryDomainId'])
            name = node['geneName']
            synonyms = node.get('synonyms') or []
            if synonyms:
                name = synonyms[0]
                # find() instead of index(): a synonym without '{' is not an error.
                brace = name.find('{')
                if brace > 0:
                    # NOTE(review): the original kept name[idx - 1:], i.e. the
                    # braced annotation instead of the text before it; assumed
                    # to be a slip - confirm against real NeDRex data.
                    name = name[:brace].rstrip()
            # `protein_name` is the field used by the rest of the code base;
            # the previous `name=` keyword is not among the visible model fields.
            proteins[uniprot] = models.Protein(
                uniprot_code=uniprot, protein_name=name, gene=node['geneName'])

        def add_edge(edge):
            uniprot = _local_id(edge['sourceDomainId'])
            protein = proteins.get(uniprot)
            if protein is None:
                # Edge refers to a protein that was not part of the node import.
                return
            entrez = _local_id(edge['targetDomainId'])
            protein.entrez = entrez
            # Same (stripped) key that add_gene() looks up below.
            gene_to_prots[entrez].add(uniprot)

        def add_gene(node):
            synonyms = node.get('synonyms') or []
            if not synonyms:
                return
            entrez = _local_id(node['primaryDomainId'])
            for uniprot in gene_to_prots.get(entrez, ()):
                proteins[uniprot].protein_name = synonyms[0]

        iter_node_collection('protein', add_protein)
        iter_edge_collection('protein_encoded_by_gene', add_edge)
        iter_node_collection('gene', add_gene)

        # TODO test updating ideas
        self.proteins = _persist(
            models.Protein.objects, proteins, self.proteins,
            ['gene', 'protein_name', 'entrez'], update)
        return len(self.proteins)

    def import_drugs(self, update):
        """Import drugs; see :meth:`import_proteins` for the meaning of *update*.

        Returns:
            Number of drugs held in the cache afterwards.
        """
        from drugstone import models

        drugs = {}
        if update:
            self.init_drugs()

        def add_drug(node):
            drug_id = _local_id(node['primaryDomainId'])
            drugs[drug_id] = models.Drug(
                drug_id=drug_id, name=node['displayName'],
                status=format_list(node['drugGroups']))

        iter_node_collection('drug', add_drug)

        # TODO test updating ideas
        # Fields mirror what Drug.update() copies (minus the key itself).
        self.drugs = _persist(
            models.Drug.objects, drugs, self.drugs,
            ['name', 'status', 'links'], update)
        return len(self.drugs)

    def import_disorders(self, update):
        """Import disorders; see :meth:`import_proteins` for the meaning of *update*.

        Returns:
            Number of disorders held in the cache afterwards.
        """
        from drugstone import models

        disorders = {}
        if update:
            self.init_disorders()

        def add_disorder(node):
            mondo_id = _local_id(node['primaryDomainId'])
            # Collect into the local dict so the update comparison below sees
            # the fresh objects (previously they went straight into the cache).
            disorders[mondo_id] = models.Disorder(
                mondo_id=mondo_id, label=node['displayName'],
                icd10=format_list(node['icd10']))

        iter_node_collection('disorder', add_disorder)

        # TODO test updating ideas
        self.disorders = _persist(
            models.Disorder.objects, disorders, self.disorders,
            ['label', 'icd10'], update)
        return len(self.disorders)
nedrex_importer.import_drugs(importer,False) print(f'Populated {n} Drugs.') # if kwargs['protein_file'] is not None: @@ -130,8 +131,8 @@ class Command(BaseCommand): if kwargs['proteins'] is not None: print('Populating Proteins...') - n = nedrex_importer.import_proteins(nedrex_importer) - # n = DataPopulator.populate_proteins(populator) + # n = nedrex_importer.import_proteins(importer, False) + n = DataPopulator.populate_proteins(populator) print(f'Populated {n} Proteins.') print('Populating ENSG IDs...') @@ -140,6 +141,7 @@ class Command(BaseCommand): if kwargs['disorders'] is not None: print('Populating Disorders...') + # n = nedrex_importer.import_disorders(importer, False) n = DataPopulator.populate_disorders(populator) print(f'Populated {n} Disorders.') diff --git a/drugstone/management/includes/DataPopulator.py b/drugstone/management/includes/DataPopulator.py index 2b101ed26219bdfb7efb2bf0cbe4a6e257fa331a..f26a9653c01a1ea93e1ea0c2549ec97ce5802438 100755 --- a/drugstone/management/includes/DataPopulator.py +++ b/drugstone/management/includes/DataPopulator.py @@ -8,7 +8,6 @@ class DataPopulator: proteins = dict() uniprot_to_ensembl = dict() gene_name_to_ensembl = defaultdict(lambda: set()) - # protein_name_to_ensembl = dict() disorders = dict() drugs = dict() @@ -19,7 +18,6 @@ class DataPopulator: self.proteins[protein.entrez]=protein self.uniprot_to_ensembl[protein.uniprot_code] = protein.entrez self.gene_name_to_ensembl[protein.gene].add(protein.entrez) - # self.protein_name_to_ensembl[protein.protein_name] = protein.entrez def init_drugs(self): if len(self.drugs)== 0: @@ -49,7 +47,6 @@ class DataPopulator: protein_name=row['protein_name']) self.uniprot_to_ensembl[row['protein_ac']] = row['entrez_id'] self.gene_name_to_ensembl[row['gene_name']].add(row['entrez_id']) - # self.protein_name_to_ensembl[row['protein_name']] = row['entrez_id'] models.Protein.objects.bulk_create(self.proteins.values()) return len(self.proteins) diff --git a/drugstone/models.py 
class Drug(models.Model):
    """A drug, identified by its unique ``drug_id``."""

    drug_id = models.CharField(max_length=10, unique=True)
    name = models.CharField(max_length=256, default='')
    status = models.CharField(max_length=128, default='')
    # in_trial = models.BooleanField(default=False)
    # in_literature = models.BooleanField(default=False)
    links = models.CharField(max_length=16 * 1024, default='')

    def __str__(self):
        return self.drug_id

    def __eq__(self, other):
        """Compare two drugs by ``drug_id``, ``name`` and ``status``.

        Returns ``NotImplemented`` for non-``Drug`` operands so Python can fall
        back to its default comparison instead of raising ``AttributeError``.
        """
        if not isinstance(other, Drug):
            return NotImplemented
        # Fix: the other side is a Drug, so compare drug_id (not uniprot_code).
        return (self.drug_id, self.name, self.status) == \
            (other.drug_id, other.name, other.status)

    def __ne__(self, other):
        """Inverse of :meth:`__eq__`, propagating ``NotImplemented``."""
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result

    # Defining __eq__ sets __hash__ to None; restore the inherited hash so
    # instances stay usable in sets and as dict keys.
    __hash__ = models.Model.__hash__

    def update(self, other):
        """Copy ``drug_id``, ``name``, ``status`` and ``links`` from *other*.

        NOTE(review): ``links`` is copied too, so an *other* constructed without
        links resets this instance's links to the field default ``''``.
        """
        self.drug_id = other.drug_id
        self.name = other.name
        self.status = other.status
        self.links = other.links
ppi_dataset=self.ppi_dataset - ) + ) if p1p2_q.exists() or p2p1_q.exists(): raise ValidationError('Protein-Protein interaction must be unique!') @@ -175,7 +207,6 @@ class ProteinProteinInteraction(models.Model): def __str__(self): return f'{self.ppi_dataset}-{self.from_protein}-{self.to_protein}' - class ProteinDrugInteraction(models.Model): pdi_dataset = models.ForeignKey( 'PDIDataset', null=True, on_delete=models.CASCADE, related_name='pdi_dataset_relation') @@ -188,7 +219,6 @@ class ProteinDrugInteraction(models.Model): def __str__(self): return f'{self.pdi_dataset}-{self.protein}-{self.drug}' - class Task(models.Model): token = models.CharField(max_length=32, unique=True) created_at = models.DateTimeField(auto_now_add=True) @@ -208,7 +238,6 @@ class Task(models.Model): result = models.TextField(null=True) - class Network(models.Model): id = models.CharField(primary_key=True, max_length=32, unique=True) created_at = models.DateTimeField(auto_now_add=True)