switched to bulk inserts

8fe8745b · AndiMajore · 267b8afd · 8fe8745b · 8fe8745b · 8fe8745b
Commit 8fe8745b authored 3 years ago by AndiMajore
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -11,8 +11,8 @@ services:
    env_file:
      - 'docker-django.env.dev'
    restart: always
-    volumes:
-      - ./:/usr/src/drugstone/
+#    volumes:
+#      - ./:/usr/src/drugstone/
    ports:
      - 8001:8000
    networks:
@@ -60,8 +60,8 @@ services:
    hostname: drugstone_celery
    env_file:
      - './docker-django.env.dev'
-    volumes:
-      - ./:/usr/src/drugstone/
+#    volumes:
+#      - ./:/usr/src/drugstone/
    depends_on:
      - redis
      - db
@@ -76,8 +76,8 @@ services:
    hostname: drugstone_celery_beat
    env_file:
      - './docker-django.env.dev'
-    volumes:
-      - ./:/usr/src/drugstone/
+#    volumes:
+#      - ./:/usr/src/drugstone/
    depends_on:
      - redis
      - db

--- a/drugstone/management/commands/import_from_nedrex.py
+++ b/drugstone/management/commands/import_from_nedrex.py
+# from collections import defaultdict
+#
+#
+# def import_proteins():
+#     import python_nedrex as nedrex
+#     from python_nedrex.core import get_nodes, get_api_key, get_edges
+#     from models import Protein
+#
+#     def iter_node_collection(coll_name, eval):
+#         offset = 0
+#         limit = 10000
+#         while True:
+#             result = get_nodes(coll_name, offset=offset, limit=limit)
+#             if not result:
+#                 return
+#             for node in result:
+#                 eval(node)
+#             offset += limit
+#
+#     def iter_edge_collection(coll_name, eval):
+#         offset = 0
+#         limit = 10000
+#         while True:
+#             result = get_edges(coll_name, offset=offset, limit=limit)
+#             if not result:
+#                 return
+#             for edge in result:
+#                 eval(edge)
+#             offset += limit
+#
+#     def add_protein(node):
+#         global proteins
+#         id = node['primaryDomainId']
+#         proteins[id] = Protein(uniprot_code=id.split('.')[1], gene=node['geneName'])
+#
+#     def add_edges(edge):
+#         global proteins
+#         id = edge['sourceDomainId']
+#         protein = proteins[id]
+#         protein.entrez = edge['targetDomainId'].split('.')[1]
+#         global gene_to_prots
+#         gene_to_prots[edge['targetDomainId']].add(id)
+#
+#     def add_genes(node):
+#         global proteins
+#         global gene_to_prots
+#         id = node['primaryDomainId']
+#         for prot_id in gene_to_prots[id]:
+#             protein = proteins[prot_id]
+#             try:
+#                 protein.protein_name = node['synonyms'][0]
+#             except:
+#                 pass
+#
+#     nedrex.config.set_url_base("http://82.148.225.92:8123/")
+#     api_key = get_api_key(accept_eula=True)
+#     nedrex.config.set_api_key(api_key)
+#
+#     proteins = dict()
+#     gene_to_prots = defaultdict(lambda: set())
+#
+#     print('Importing Proteins')
+#     iter_node_collection('protein', add_protein)
+#     print('Importing Protein-Gene mapping')
+#     iter_edge_collection('protein_encoded_by_gene', add_edges)
+#     print('Mapping Gene information')
+#     iter_node_collection('gene', add_genes)
+#     Protein.objects.bulk_create(proteins.values())
--- a/drugstone/management/commands/populate_db.py
+++ b/drugstone/management/commands/populate_db.py
@@ -74,47 +74,6 @@ class DatabasePopulator:

        print('Done!\n')

-    # def populate_protein_model(self):
-    #     print('Populating Protein model ...')
-    #     protein_df = pd.read_csv(f'{self.data_dir}/data_drugstone/{self.protein_file}', delimiter='\t')
-    #     for _, row in protein_df.iterrows():
-    #         protein_ac = row['protein_ac']
-    #         gene_name = row['gene_name']
-    #         protein_name = row['protein_name']
-
-    #         if gene_name == 'None':
-    #             gene_name = ''
-
-    #         protein_object = Protein(uniprot_code=protein_ac, gene=gene_name, protein_name=protein_name)
-    #         protein_object.save()
-
-    #     print('Done!\n')
-
-    # def populate_pdi_model(self):
-    #     print('Populating ProteinDrugInteraction model ...')
-    #     pdi_df = pd.read_csv(f'{self.data_dir}/data_drugstone/{self.pdi_file}', delimiter='\t')
-
-    #     for _, row in pdi_df.iterrows():
-    #         protein_ac = row['protein_ac']
-    #         drug_id = row['drug_id']
-
-    #         try:
-    #             protein_object = Protein.objects.get(uniprot_code=protein_ac)
-    #         except Protein.DoesNotExist:
-    #             print(f'Protein AC {protein_ac} not found in Protein model!')
-    #             continue
-
-    #         try:
-    #             drug_object = Drug.objects.get(drug_id=drug_id)
-    #         except Drug.DoesNotExist:
-    #             print(f'Drug ID {drug_id} not found in Drug model!')
-    #             continue
-
-    #         # insert protein-drug to PDI model
-    #         pdi_object = ProteinDrugInteraction(protein=protein_object, drug=drug_object)
-    #         pdi_object.save()
-
-    #     print('Done!\n')

    def populate_exp_model(self):
        print('Populating Tissue and ExpressionLevel model ...')
@@ -146,32 +105,6 @@ class DatabasePopulator:

        print(f'Added {proteins_linked} expression levels!\n')

-    # def populate_ppi_model(self):
-    #     print('Populating ProteinProteinInteraction model ...')
-    #     pdi_df = pd.read_csv(f'{self.data_dir}/data_drugstone/{self.ppi_file}', delimiter='\t')
-
-    #     for _, row in pdi_df.iterrows():
-
-    #         from_protein_ac = row['from_protein_ac']
-    #         to_protein_ac = row['to_protein_ac']
-
-    #         try:
-    #             from_protein_object = Protein.objects.get(uniprot_code=from_protein_ac)
-    #         except Protein.DoesNotExist:
-    #             print(f'Protein AC {from_protein_ac} not found in Protein model!')
-    #             continue
-
-    #         try:
-    #             to_protein_object = Protein.objects.get(uniprot_code=to_protein_ac)
-    #         except Protein.DoesNotExist:
-    #             print(f'Protein AC {to_protein_ac} not found in Protein model!')
-    #             continue
-
-    #         # insert protein-protein edge to ProteinProteinInteraction model
-    #         ppi_object = ProteinProteinInteraction(from_protein=from_protein_object, to_protein=to_protein_object)
-    #         ppi_object.save()
-
-    #     print('Done!\n')


 class Command(BaseCommand):

--- a/drugstone/management/includes/DataPopulator.py
+++ b/drugstone/management/includes/DataPopulator.py
@@ -12,17 +12,16 @@ class DataPopulator:
            int: Count of how many proteins were added
        """
        df = DataLoader.load_proteins()
-        count = 0
+        proteins = list()
        for _, row in df.iterrows():
-            _, created = models.Protein.objects.update_or_create(
+            proteins.append(models.Protein(
                uniprot_code=row['protein_ac'],
                gene=row['gene_name'],
                entrez=row['entrez_id'],
-                defaults={'protein_name': row['protein_name']}
+                protein_name=row['protein_name'])
            )
-            if created:
-                count += 1
-        return count
+        models.Protein.objects.bulk_create(proteins)
+        return len(proteins)

    def populate_disorders() -> int:
        """ Populates the Disorder table in the django database.
@@ -32,17 +31,15 @@ class DataPopulator:
            int: Count of how many disorders were added
        """
        df = DataLoader.load_disorders()
-        count = 0
+        bulk = list()
        for _, row in df.iterrows():
-            _, created = models.Disorder.objects.update_or_create(
+            bulk.append(models.Disorder(
                mondo_id=row['mondo_id'],
                label=row['label'],
-                icd10=row['icd10'],
-                defaults={'label': row['label']}
-            )
-            if created:
-                count += 1
-        return count
+                icd10=row['icd10']
+            ))
+        models.Disorder.objects.bulk_create(bulk)
+        return len(bulk)

    def populate_ensg() -> int:
        """ Populates the Ensembl-Gene table in the django database.
@@ -53,14 +50,13 @@ class DataPopulator:
            int: Count of how many ensg-protein relations were added
        """
        data = DataLoader.load_ensg()
-        count = 0
+        bulk = list()
        for entrez, ensg_list in data.items():
            protein = models.Protein.objects.get(entrez=entrez)
            for ensg in ensg_list:
-                _, created = models.EnsemblGene.objects.get_or_create(name=ensg, protein=protein)
-                if created:
-                    count += 1
-        return count
+                bulk.append(models.EnsemblGene(name=ensg, protein=protein))
+        models.EnsemblGene.objects.bulk_create(bulk)
+        return len(bulk)

    def populate_ppi_string() -> int:
        """ Populates the Protein-Protein-Interactions from STRINGdb
@@ -75,7 +71,7 @@ class DataPopulator:
            link='https://string-db.org/',
            version='11.0'
        )
-        count = 0
+        bulk = list()
        for _, row in df.iterrows():
            try:
                # try fetching proteins
@@ -85,17 +81,16 @@ class DataPopulator:
                # continue if not found
                continue
            try:
-                _, created = models.ProteinProteinInteraction.objects.get_or_create(
+                bulk.append(models.ProteinProteinInteraction(
                    ppi_dataset=dataset,
                    from_protein=protein_a,
                    to_protein=protein_b
-                )
-                if created:
-                    count += 1
+                ))
            except models.ValidationError:
                # duplicate
                continue
-        return count
+        models.ProteinProteinInteraction.objects.bulk_create(bulk)
+        return len(bulk)

    def populate_ppi_apid() -> int:
        """ Populates the Protein-Protein-Interactions from Apid
@@ -110,7 +105,7 @@ class DataPopulator:
            link='http://cicblade.dep.usal.es:8080/APID/',
            version='January 2019'
        )
-        count = 0
+        bulk = list()
        for _, row in df.iterrows():
            try:
                # try fetching proteins
@@ -120,17 +115,16 @@ class DataPopulator:
                # continue if not found
                continue
            try:
-                _, created = models.ProteinProteinInteraction.objects.get_or_create(
+                bulk.append(models.ProteinProteinInteraction(
                    ppi_dataset=dataset,
                    from_protein=protein_a,
                    to_protein=protein_b
-                )
-                if created:
-                    count += 1
+                ))
            except models.ValidationError:
                # duplicate
                continue
-        return count
+        models.ProteinProteinInteraction.objects.bulk_create(bulk)
+        return len(bulk)

    def populate_ppi_biogrid() -> int:
        """ Populates the Protein-Protein-Interactions from BioGRID
@@ -145,7 +139,7 @@ class DataPopulator:
            link='https://thebiogrid.org/',
            version='4.0'
        )
-        count = 0
+        bulk = list()
        for _, row in df.iterrows():
            try:
                # try fetching proteins
@@ -155,17 +149,16 @@ class DataPopulator:
                # continue if not found
                continue
            try:
-                _, created = models.ProteinProteinInteraction.objects.get_or_create(
+                bulk.append(models.ProteinProteinInteraction(
                    ppi_dataset=dataset,
                    from_protein=protein_a,
                    to_protein=protein_b
-                )
-                if created:
-                    count += 1
+                ))
            except models.ValidationError:
                # duplicate
                continue
-        return count
+        models.ProteinProteinInteraction.objects.bulk_create(bulk)
+        return len(bulk)

    def populate_pdi_chembl() -> int:
        """ Populates the Protein-Drug-Interactions from Chembl
@@ -180,8 +173,8 @@ class DataPopulator:
            link='https://www.ebi.ac.uk/chembl/',
            version='27',
        )
-        count = 0
-        for index, row in df.iterrows():
+        bulk = list()
+        for _, row in df.iterrows():
            try:
                # try fetching protein
                protein = models.Protein.objects.get(uniprot_code=row['protein_ac'])
@@ -194,14 +187,13 @@ class DataPopulator:
            except models.Drug.DoesNotExist:
                # continue if not found
                continue
-            _, created = models.ProteinDrugInteraction.objects.get_or_create(
+            bulk.append(models.ProteinDrugInteraction(
                pdi_dataset=dataset,
                protein=protein,
                drug=drug
-            )
-            if created:
-                count += 1
-        return count
+            ))
+        models.ProteinDrugInteraction.objects.bulk_create(bulk)  # single insert after the loop, via the PDI model's manager (bulk holds ProteinDrugInteraction objects)
+        return len(bulk)

    def populate_pdis_disgenet() -> int:
        """ Populates the Protein-Disorder-Interactions from DisGeNET
@@ -216,8 +208,8 @@ class DataPopulator:
            link='https://www.disgenet.org/home/',
            version='6.0',
        )
-        count = 0
-        for index, row in df.iterrows():
+        bulk = list()
+        for _, row in df.iterrows():
            try:
                # try fetching protein
                protein = models.Protein.objects.get(uniprot_code=row['protein_name'])
@@ -230,15 +222,14 @@ class DataPopulator:
            except models.Disorder.DoesNotExist:
                # continue if not found
                continue
-            _, created = models.ProteinDisorderAssociation.objects.get_or_create(
+            bulk.append(models.ProteinDisorderAssociation(
                pdis_dataset=dataset,
                protein=protein,
                disorder=disorder,
                score=row['score']
-            )
-            if created:
-                count += 1
-        return count
+            ))
+        models.ProteinDisorderAssociation.objects.bulk_create(bulk)
+        return len(bulk)

    def populate_drdis_drugbank() -> int:
        """ Populates the Drug-Disorder-Indications from DrugBank
@@ -253,8 +244,8 @@ class DataPopulator:
            link='https://go.drugbank.com/',
            version='5.1.8',
        )
-        count = 0
-        for index, row in df.iterrows():
+        bulk = list()
+        for _, row in df.iterrows():
            try:
                # try fetching protein
                drug = models.Drug.objects.get(drug_id=row['drugbank_id'])
@@ -267,14 +258,13 @@ class DataPopulator:
            except models.Disorder.DoesNotExist:
                # continue if not found
                continue
-            _, created = models.DrugDisorderIndication.objects.get_or_create(
+            bulk.append(models.DrugDisorderIndication(
                drdi_dataset=dataset,
                drug=drug,
                disorder=disorder,
-            )
-            if created:
-                count += 1
-        return count
+            ))
+        models.DrugDisorderIndication.objects.bulk_create(bulk)
+        return len(bulk)

    def populate_pdi_dgidb() -> int:
        """ Populates the Protein-Drug-Interactions from DGIdb
@@ -289,7 +279,7 @@ class DataPopulator:
            link='https://www.dgidb.org/',
            version='4.2.0'
        )
-        count = 0
+        bulk = list()
        for _, row in df.iterrows():
            try:
                # try fetching protein
@@ -303,14 +293,13 @@ class DataPopulator:
            except models.Drug.DoesNotExist:
                # continue if not found
                continue
-            _, created = models.ProteinDrugInteraction.objects.get_or_create(
+            bulk.append(models.ProteinDrugInteraction(
                pdi_dataset=dataset,
                protein=protein,
                drug=drug
-            )
-            if created:
-                count += 1
-        return count
+            ))
+        models.ProteinDrugInteraction.objects.bulk_create(bulk)
+        return len(bulk)

    def populate_pdi_drugbank() -> int:
        """ Populates the Protein-Drug-Interactions from Drugbank
@@ -325,7 +314,7 @@ class DataPopulator:
            link='https://go.drugbank.com/',
            version='5.1.7'
        )
-        count = 0
+        bulk = list()
        for _, row in df.iterrows():
            try:
                # try fetching protein
@@ -339,11 +328,10 @@ class DataPopulator:
            except models.Drug.DoesNotExist:
                # continue if not found
                continue
-            _, created = models.ProteinDrugInteraction.objects.get_or_create(
+            bulk.append(models.ProteinDrugInteraction(
                pdi_dataset=dataset,
                protein=protein,
                drug=drug
-            )
-            if created:
-                count += 1
-        return count
+            ))
+        models.ProteinDrugInteraction.objects.bulk_create(bulk)
+        return len(bulk)
--- a/drugstone/settings/celery_schedule.py
+++ b/drugstone/settings/celery_schedule.py
@@ -3,6 +3,6 @@ from celery.schedules import crontab
 CELERY_BEAT_SCHEDULE = {
    'update_db': {
        'task': 'drugstone.tasks.task_update_db_from_nedrex',
-        'schedule': crontab(minute='*/1000'),
+        'schedule': crontab(minute='*/1'),
    },
 }
--- a/drugstone/settings/settings.py
+++ b/drugstone/settings/settings.py
@@ -24,7 +24,7 @@ BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 SECRET_KEY = os.environ.get('SECRET_KEY')

 # SECURITY WARNING: don't run with debug turned on in production!
-DEBUG = os.environ.get('DEBUG', False)
+DEBUG = os.environ.get('DEBUG') == '1'

 ALLOWED_HOSTS = [
    'localhost',
@@ -45,6 +45,7 @@ INSTALLED_APPS = [
    'django.contrib.staticfiles',
    'corsheaders',
    'drugstone',
+    # 'python_nedrex',
    'rest_framework',
 ]


--- a/drugstone/tasks.py
+++ b/drugstone/tasks.py
@@ -11,8 +11,8 @@ def task_update_db_from_nedrex():
    print('here')

    logger.info('Fetching data...')
-    fetch_nedrex_data()
+    # fetch_nedrex_data()

    logger.info('Integrating data...')
-    integrate_nedrex_data()
+    # integrate_nedrex_data()
    logger.info('Done.')