db init cleanup and significant speedup

Former-commit-id: b98d1951d6d94f03ffe97486b492d734a2a4258e [formerly 88e844cb12d9d9e129f9b7f50856d11124a4a475] Former-commit-id: 66631e2e8e954e5144472bd8ffb170ed4858c30c

db init cleanup and significant speedup
ca0e5a7b · AndiMajore · e46ee3df · ca0e5a7b · ca0e5a7b · ca0e5a7b
Commit ca0e5a7b authored 2 years ago by AndiMajore
--- a/data_drugstone/drug-file.txt
+++ b/data_drugstone/drug-file.txt
--- a/data_drugstone/PDI/chembl_drug_gene_interactions_uniq.csv
+++ b/data_drugstone/PDI/chembl_drug_gene_interactions_uniq.csv
--- a/data_drugstone/PDI/drugbank_drug_gene_interactions_uniq.csv
+++ b/data_drugstone/PDI/drugbank_drug_gene_interactions_uniq.csv
--- a/drugstone/management/commands/populate_db.py
+++ b/drugstone/management/commands/populate_db.py
@@ -2,7 +2,7 @@ from django.core.management.base import BaseCommand
 import pandas as pd
 from django.db import OperationalError, IntegrityError

-from drugstone.models import Protein, Drug, Tissue, ExpressionLevel, PPIDataset, PDIDataset, Disorder, PDisDataset
+from drugstone.models import Protein, Drug, Tissue, ExpressionLevel, PPIDataset, PDIDataset, Disorder, PDisDataset, DrDiDataset
 from drugstone.models import ProteinProteinInteraction, ProteinDrugInteraction

 from drugstone.management.includes.DataPopulator import DataPopulator
@@ -57,55 +57,6 @@ class DatabasePopulator:
                self.delete_model(DrDiDataset)


-    def populate_drug_model(self):
-        print('Populating Drug model ...')
-        drug_df = pd.read_csv(f'{self.data_dir}/data_drugstone/{self.drug_file}', delimiter='\t')
-        for _, row in drug_df.iterrows():
-            drug_id = row['drug_id']
-            drug_name = row['drug_name']
-            drug_status = row['drug_status']
-            # links = row['links']
-            Drug.objects.update_or_create(
-                drug_id=drug_id, 
-                name=drug_name, 
-                status=drug_status, 
-                # links=links
-                )
-
-        print('Done!\n')
-
-
-    def populate_exp_model(self):
-        print('Populating Tissue and ExpressionLevel model ...')
-        exp_df = pd.read_csv(f'{self.data_dir}/data_drugstone/{self.exp_file}', delimiter='\t')
-
-        tissues_models = dict()
-        for tissue_name in exp_df.columns.values[2:]:
-            try:
-                tissue_model = Tissue.objects.get(name=tissue_name)
-            except Tissue.DoesNotExist:
-                tissue_model = Tissue.objects.create(name=tissue_name)
-            tissues_models[tissue_name] = tissue_model
-
-        proteins_linked = 0
-
-        for _, row in exp_df.iterrows():
-            gene_name = row['Description']
-
-            for protein_model in Protein.objects.filter(gene=gene_name).all():
-                proteins_linked += 1
-
-                for tissue_name, tissue_model in tissues_models.items():
-                    try:
-                        ExpressionLevel.objects.create(protein=protein_model,
-                                                       tissue=tissue_model,
-                                                       expression_level=row[tissue_name])
-                    except IntegrityError:
-                        pass
-
-        print(f'Added {proteins_linked} expression levels!\n')
-
-

 class Command(BaseCommand):
    def add_arguments(self, parser):
@@ -139,6 +90,7 @@ class Command(BaseCommand):
        pp = kwargs['protein_protein']
        pd = kwargs['protein_drug']

+
        db_populator = DatabasePopulator(data_dir=data_dir,
                                        # protein_file=protein_file,
                                        drug_file=drug_file,
@@ -152,8 +104,12 @@ class Command(BaseCommand):
            db_populator.delete_models(model_list)
            return

+        populator = DataPopulator()
+
        if kwargs['drug_file'] is not None:
-            db_populator.populate_drug_model()
+            print('Populating Drugs...')
+            n = DataPopulator.populate_drugs(populator)
+            print(f'Populated {n} Drugs.')

        # if kwargs['protein_file'] is not None:
        #     db_poulator.populate_protein_model()
@@ -165,52 +121,54 @@ class Command(BaseCommand):
        #     db_poulator.populate_ppi_model()

        if kwargs['exp_file'] is not None:
-            db_populator.populate_exp_model()
+            print('Populating Expressions...')
+            n = DataPopulator.populate_expessions(populator)
+            print(f'Populated {n} Expressions.')

        if kwargs['proteins'] is not None:
            print('Populating Proteins...')
-            n = DataPopulator.populate_proteins()
+            n = DataPopulator.populate_proteins(populator)
            print(f'Populated {n} Proteins.')
            
            print('Populating ENSG IDs...')
-            n = DataPopulator.populate_ensg()
+            n = DataPopulator.populate_ensg(populator)
            print(f'Populated {n} ENSG IDs.')

        if kwargs['disorders'] is not None:
            print('Populating Disorders...')
-            n = DataPopulator.populate_disorders()
+            n = DataPopulator.populate_disorders(populator)
            print(f'Populated {n} Disorders.')

        if kwargs['protein_protein'] is not None:
            print('Populating PPIs from STRING...')
-            n = DataPopulator.populate_ppi_string()
+            n = DataPopulator.populate_ppi_string(populator)
            print(f'Populated {n} PPIs from STRING.')

            print('Populating PPIs from APID...')
-            n = DataPopulator.populate_ppi_apid()
+            n = DataPopulator.populate_ppi_apid(populator)
            print(f'Populated {n} PPIs from APID.')

            print('Populating PPIs from BioGRID...')
-            n = DataPopulator.populate_ppi_biogrid()
+            n = DataPopulator.populate_ppi_biogrid(populator)
            print(f'Populated {n} PPIs from BioGRID.')

        if kwargs['protein_drug'] is not None:
            print('Populating PDIs from Chembl...')
-            n = DataPopulator.populate_pdi_chembl()
+            n = DataPopulator.populate_pdi_chembl(populator)
            print(f'Populated {n} PDIs from Chembl.')

            print('Populating PDIs from DGIdb...')
-            n = DataPopulator.populate_pdi_dgidb() 
+            n = DataPopulator.populate_pdi_dgidb(populator)
            print(f'Populated {n} PDIs from DGIdb.')

            print('Populating PDIs from DrugBank...')
-            n = DataPopulator.populate_pdi_drugbank()
+            n = DataPopulator.populate_pdi_drugbank(populator)
            print(f'Populated {n} PDIs from DrugBank.')
        if kwargs['protein_disorder'] is not None:
            print('Populating PDis associations from DisGeNET...')
-            n=DataPopulator.populate_pdis_disgenet()
+            n=DataPopulator.populate_pdis_disgenet(populator)
            print(f'Populated {n} PDis associations from DisGeNET.')
        if kwargs['drug_disorder'] is not None:
            print('Populating DrDi indications from DrugBank...')
-            n=DataPopulator.populate_drdis_drugbank()
+            n=DataPopulator.populate_drdis_drugbank(populator)
            print(f'Populated {n} DrDi associations from DrugBank.')
--- a/drugstone/management/includes/DataLoader.py
+++ b/drugstone/management/includes/DataLoader.py
@@ -4,6 +4,8 @@ import json

 class DataLoader:
    PATH_PROTEINS = 'data_drugstone/Proteins/'
+    PATH_DRUGS = 'data_drugstone/Drugs/'
+    PATH_EXPR = 'data_drugstone/'
    PATH_DISORDERS = 'data_drugstone/Disorders/'
    PATH_PDI = 'data_drugstone/PDI/'
    PATH_PPI = 'data_drugstone/PPI/'
@@ -16,15 +18,20 @@ class DataLoader:

    # Disorders
    DISORDERS_MONDO = 'disorders.tsv'
+    # Drugs
+    DRUG_FILE = 'drug-file.txt'
+
+    # Expressions
+    EXPR_FILE = 'gene_tissue_expression.gct'

    # Protein-Protein-Interactions
    PPI_APID = 'apid_9606_Q2.txt'
    PPI_BIOGRID = 'BIOGRID-ORGANISM-Homo_sapiens-3.5.187.mitab.txt'
    PPI_STRING = 'string_interactions.csv'
    # Protein-Drug-Interactions
-    PDI_CHEMBL = 'chembl_drug_gene_interactions.csv'
+    PDI_CHEMBL = 'chembl_drug_gene_interactions_uniq.csv'
    PDI_DGIDB = 'DGIdb_drug_gene_interactions.csv'
-    PDI_DRUGBANK = 'drugbank_drug_gene_interactions.csv'
+    PDI_DRUGBANK = 'drugbank_drug_gene_interactions_uniq.csv'

    # Protein-Disorder-Interaction
    PDi_DISGENET = 'disgenet-protein_disorder_association.tsv'
@@ -59,6 +66,14 @@ class DataLoader:
        df = pd.read_csv(f'{DataLoader.PATH_PROTEINS}{DataLoader.PROTEINS_COVEX}')
        df['entrez_id'] = df['entrez_id'].map(DataLoader._clean_entrez)
        return df
+    @staticmethod
+    def load_drugs() -> pd.DataFrame:
+        """ Loads the drug table from the tab-separated drug file.
+
+        Returns:
+            pd.DataFrame: the parsed content of PATH_DRUGS + DRUG_FILE
+        """
+        drug_path = DataLoader.PATH_DRUGS + DataLoader.DRUG_FILE
+        return pd.read_csv(drug_path, sep='\t')
+
+    @staticmethod
+    def load_expressions() -> pd.DataFrame:
+        """ Loads the gene/tissue expression table from the tab-separated expression file.
+
+        Returns:
+            pd.DataFrame: the parsed content of PATH_EXPR + EXPR_FILE
+        """
+        expression_path = DataLoader.PATH_EXPR + DataLoader.EXPR_FILE
+        return pd.read_csv(expression_path, sep='\t')
+

    @staticmethod
    def load_disorders() -> pd.DataFrame:
@@ -206,7 +221,7 @@ class DataLoader:
        Returns:
            pd.DataFrame: columns "drug_id" and "protein_ac"
        """
-        return pd.read_csv(f'{DataLoader.PATH_PDI}{DataLoader.PDI_CHEMBL}', index_col=0)
+        return pd.read_csv(f'{DataLoader.PATH_PDI}{DataLoader.PDI_CHEMBL}')

    @staticmethod
    def load_pdis_disgenet() -> pd.DataFrame:
@@ -244,6 +259,6 @@ class DataLoader:
        Returns:
            pd.DataFrame: columns "drug_id" and "entrez_id"
        """
-        df = pd.read_csv(f'{DataLoader.PATH_PDI}{DataLoader.PDI_DRUGBANK}', index_col=0).dropna()
+        df = pd.read_csv(f'{DataLoader.PATH_PDI}{DataLoader.PDI_DRUGBANK}').dropna()
        df['entrez_id'] = df['entrez_id'].map(DataLoader._clean_entrez)
        return df
--- a/drugstone/management/includes/DataPopulator.py
+++ b/drugstone/management/includes/DataPopulator.py
+from collections import defaultdict
+
 from drugstone.management.includes.DataLoader import DataLoader
 import drugstone.models as models


 class DataPopulator:
+    # Lookup caches used by the init_* and populate_* methods.
+    # NOTE: these are class attributes, so every DataPopulator instance reads
+    # and fills the very same dictionaries.
+    # entrez id -> Protein
+    proteins = dict()
+    # uniprot code -> entrez id (despite the "ensembl" in the name)
+    uniprot_to_ensembl = dict()
+    # gene name -> set of entrez ids (despite the "ensembl" in the name)
+    gene_name_to_ensembl = defaultdict(set)
+    # protein_name_to_ensembl = dict()
+    # mondo id -> Disorder
+    disorders = dict()
+    # drug id -> Drug
+    drugs = dict()
+
+    def init_proteins(self):
+        """ Fills the protein lookup maps from the database if they are still empty.
+
+        Does nothing when self.proteins already holds entries. Otherwise every
+        stored Protein is indexed by its entrez id, and its uniprot code and
+        gene name are mapped to that entrez id.
+        """
+        if self.proteins:
+            # maps were already generated (or filled by populate_proteins)
+            return
+        print("Generating protein maps...")
+        for entry in models.Protein.objects.all():
+            entrez_id = entry.entrez
+            self.proteins[entrez_id] = entry
+            self.uniprot_to_ensembl[entry.uniprot_code] = entrez_id
+            self.gene_name_to_ensembl[entry.gene].add(entrez_id)
+            # self.protein_name_to_ensembl[entry.protein_name] = entrez_id
+
+    def init_drugs(self):
+        """ Fills the drug lookup map from the database if it is still empty.
+
+        Does nothing when self.drugs already holds entries. Otherwise every
+        stored Drug is indexed by its drug_id.
+        """
+        if self.drugs:
+            return
+        print("Generating drug map...")
+        self.drugs.update((entry.drug_id, entry) for entry in models.Drug.objects.all())

-    def populate_proteins() -> int:
+    def init_disorders(self):
+        """ Fills the disorder lookup map from the database if it is still empty.
+
+        Does nothing when self.disorders already holds entries. Otherwise every
+        stored Disorder is indexed by its mondo_id.
+        """
+        already_loaded = len(self.disorders) > 0
+        if already_loaded:
+            return
+        print("Generating disorder map...")
+        for entry in models.Disorder.objects.all():
+            self.disorders[entry.mondo_id] = entry
+
+    def populate_proteins(self) -> int:
        """ Populates the Protein table in the django database.
        Handles loading the data and passing it to the django database

@@ -12,18 +41,20 @@ class DataPopulator:
            int: Count of how many proteins were added
        """
        df = DataLoader.load_proteins()
-        proteins = list()
        for _, row in df.iterrows():
-            proteins.append(models.Protein(
+            self.proteins[row['entrez_id']] = models.Protein(
                uniprot_code=row['protein_ac'],
                gene=row['gene_name'],
                entrez=row['entrez_id'],
                protein_name=row['protein_name'])
-            )
-        models.Protein.objects.bulk_create(proteins)
-        return len(proteins)
+            self.uniprot_to_ensembl[row['protein_ac']] = row['entrez_id']
+            self.gene_name_to_ensembl[row['gene_name']].add(row['entrez_id'])
+            # self.protein_name_to_ensembl[row['protein_name']] = row['entrez_id']
+
+        models.Protein.objects.bulk_create(self.proteins.values())
+        return len(self.proteins)

-    def populate_disorders() -> int:
+    def populate_disorders(self) -> int:
        """ Populates the Disorder table in the django database.
        Handles loading the data and passing it to the django database

@@ -31,17 +62,63 @@ class DataPopulator:
            int: Count of how many disorders were added
        """
        df = DataLoader.load_disorders()
-        bulk = list()
        for _, row in df.iterrows():
-            bulk.append(models.Disorder(
+            self.disorders[row['mondo_id']] = models.Disorder(
                mondo_id=row['mondo_id'],
                label=row['label'],
                icd10=row['icd10']
-            ))
-        models.Disorder.objects.bulk_create(bulk)
+            )
+        models.Disorder.objects.bulk_create(self.disorders.values())
+        return len(self.disorders)
+
+    def populate_drugs(self) -> int:
+        """ Populates the Drug table in the django database.
+        Handles loading the data and passing it to the django database
+
+        Returns:
+            int: Count of how many drugs were added
+        """
+        df = DataLoader.load_drugs()
+        # collect the new rows separately so that drugs which are already cached
+        # (e.g. loaded from the database by init_drugs) are not inserted again
+        # and are not counted as added
+        new_drugs = dict()
+        for _, row in df.iterrows():
+            drug_id = row['drug_id']
+            # keyed by drug_id: a repeated id in the file keeps only its last row
+            new_drugs[drug_id] = models.Drug(
+                drug_id=drug_id,
+                name=row['drug_name'],
+                status=row['drug_status'])
+        models.Drug.objects.bulk_create(list(new_drugs.values()))
+        # only extend the shared cache once the insert went through
+        self.drugs.update(new_drugs)
+        return len(new_drugs)
+
+    def populate_expessions(self) -> int:
+        """ Populates the Tissue and ExpressionLevel tables in the django database.
+        Handles loading the data and passing it to the django database
+
+        Returns:
+            int: Count of how many expression levels were added
+        """
+        self.init_proteins()
+        df = DataLoader.load_expressions()
+
+        # every column after the first two is treated as one tissue
+        tissues_models = dict()
+        for tissue_name in df.columns.values[2:]:
+            tissue_model, _ = models.Tissue.objects.get_or_create(name=tissue_name)
+            tissues_models[tissue_name] = tissue_model
+
+        # (tissue, protein) pairs that already received an expression level
+        seen = set()
+        bulk = list()
+
+        for _, row in df.iterrows():
+            # .get() instead of [] so that genes without a protein do not add
+            # empty entries to the gene_name_to_ensembl defaultdict
+            for protein_id in self.gene_name_to_ensembl.get(row['Description'], ()):
+                protein_model = self.proteins[protein_id]
+
+                for tissue_name, tissue_model in tissues_models.items():
+                    key = (tissue_name, protein_id)
+                    if key in seen:
+                        # keep only the first value per tissue and protein
+                        continue
+                    seen.add(key)
+                    bulk.append(models.ExpressionLevel(protein=protein_model,
+                                                       tissue=tissue_model,
+                                                       expression_level=row[tissue_name]))
+        models.ExpressionLevel.objects.bulk_create(bulk)
        return len(bulk)

-    def populate_ensg() -> int:
+    def populate_ensg(self) -> int:
        """ Populates the Ensembl-Gene table in the django database.
        Also maps the added ensg entries to the corresponding proteins.
        Handles loading the data and passing it to the django database
@@ -49,22 +126,24 @@ class DataPopulator:
        Returns:
            int: Count of how many ensg-protein relations were added
        """
+        self.init_proteins()
        data = DataLoader.load_ensg()
        bulk = list()
        for entrez, ensg_list in data.items():
-            protein = models.Protein.objects.get(entrez=entrez)
+            protein = self.proteins[entrez]
            for ensg in ensg_list:
                bulk.append(models.EnsemblGene(name=ensg, protein=protein))
        models.EnsemblGene.objects.bulk_create(bulk)
        return len(bulk)

-    def populate_ppi_string() -> int:
+    def populate_ppi_string(self) -> int:
        """ Populates the Protein-Protein-Interactions from STRINGdb
        Handles loading the data and passing it to the django database

        Returns:
            int: Count of how many interactions were added
        """
+        self.init_proteins()
        df = DataLoader.load_ppi_string()
        dataset, _ = models.PPIDataset.objects.get_or_create(
            name='STRING',
@@ -75,9 +154,9 @@ class DataPopulator:
        for _, row in df.iterrows():
            try:
                # try fetching proteins
-                protein_a = models.Protein.objects.get(entrez=row['entrez_a'])
-                protein_b = models.Protein.objects.get(entrez=row['entrez_b'])
-            except models.Protein.DoesNotExist:
+                protein_a = self.proteins[row['entrez_a']]
+                protein_b = self.proteins[row['entrez_b']]
+            except KeyError:
                # continue if not found
                continue
            try:
@@ -92,13 +171,14 @@ class DataPopulator:
        models.ProteinProteinInteraction.objects.bulk_create(bulk)
        return len(bulk)

-    def populate_ppi_apid() -> int:
+    def populate_ppi_apid(self) -> int:
        """ Populates the Protein-Protein-Interactions from Apid
        Handles loading the data and passing it to the django database

        Returns:
            int: Count of how many interactions were added
        """
+        self.init_proteins()
        df = DataLoader.load_ppi_apid()
        dataset, _ = models.PPIDataset.objects.get_or_create(
            name='APID',
@@ -109,9 +189,9 @@ class DataPopulator:
        for _, row in df.iterrows():
            try:
                # try fetching proteins
-                protein_a = models.Protein.objects.get(uniprot_code=row['from_protein_ac'])
-                protein_b = models.Protein.objects.get(uniprot_code=row['to_protein_ac'])
-            except models.Protein.DoesNotExist:
+                protein_a = self.proteins[self.uniprot_to_ensembl[row['from_protein_ac']]]
+                protein_b = self.proteins[self.uniprot_to_ensembl[row['to_protein_ac']]]
+            except KeyError:
                # continue if not found
                continue
            try:
@@ -121,18 +201,18 @@ class DataPopulator:
                    to_protein=protein_b
                ))
            except models.ValidationError:
-                # duplicate
                continue
        models.ProteinProteinInteraction.objects.bulk_create(bulk)
        return len(bulk)

-    def populate_ppi_biogrid() -> int:
+    def populate_ppi_biogrid(self) -> int:
        """ Populates the Protein-Protein-Interactions from BioGRID
        Handles loading the data and passing it to the django database

        Returns:
            int: Count of how many interactions were added
        """
+        self.init_proteins()
        df = DataLoader.load_ppi_biogrid()
        dataset, _ = models.PPIDataset.objects.get_or_create(
            name='BioGRID',
@@ -143,9 +223,10 @@ class DataPopulator:
        for _, row in df.iterrows():
            try:
                # try fetching proteins
-                protein_a = models.Protein.objects.get(entrez=row['entrez_a'])
-                protein_b = models.Protein.objects.get(entrez=row['entrez_b'])
-            except models.Protein.DoesNotExist:
+                protein_a = self.proteins[row['entrez_a']]
+                protein_b = self.proteins[row['entrez_b']]
+            except KeyError:
+                # TODO update error
                # continue if not found
                continue
            try:
@@ -160,13 +241,15 @@ class DataPopulator:
        models.ProteinProteinInteraction.objects.bulk_create(bulk)
        return len(bulk)

-    def populate_pdi_chembl() -> int:
+    def populate_pdi_chembl(self) -> int:
        """ Populates the Protein-Drug-Interactions from Chembl
        Handles Loading the data and passing it to the django database

        Returns:
            int: Count of how many interactions were added
        """
+        self.init_proteins()
+        self.init_drugs()
        df = DataLoader.load_pdi_chembl()
        dataset, _ = models.PDIDataset.objects.get_or_create(
            name='ChEMBL',
@@ -176,15 +259,14 @@ class DataPopulator:
        bulk = list()
        for _, row in df.iterrows():
            try:
-                # try fetching protein
-                protein = models.Protein.objects.get(uniprot_code=row['protein_ac'])
-            except models.Protein.DoesNotExist:
+                protein = self.proteins[self.uniprot_to_ensembl[row['protein_ac']]]
+            except KeyError:
                # continue if not found
                continue
            try:
                # try fetching drug
-                drug = models.Drug.objects.get(drug_id=row['drug_id'])
-            except models.Drug.DoesNotExist:
+                drug = self.drugs[row['drug_id']]
+            except KeyError:
                # continue if not found
                continue
            bulk.append(models.ProteinDrugInteraction(
@@ -195,13 +277,15 @@ class DataPopulator:
        models.ProteinDrugInteraction.objects.bulk_create(bulk)
        return len(bulk)

-    def populate_pdis_disgenet() -> int:
+    def populate_pdis_disgenet(self,) -> int:
        """ Populates the Protein-Disorder-Interactions from DisGeNET
        Handles Loading the data and passing it to the django database

        Returns:
            int: Count of how many interactions were added
        """
+        self.init_proteins()
+        self.init_disorders()
        df = DataLoader.load_pdis_disgenet()
        dataset, _ = models.PDisDataset.objects.get_or_create(
            name='DisGeNET',
@@ -212,14 +296,14 @@ class DataPopulator:
        for _, row in df.iterrows():
            try:
                # try fetching protein
-                protein = models.Protein.objects.get(uniprot_code=row['protein_name'])
-            except models.Protein.DoesNotExist:
+                protein = self.proteins[self.uniprot_to_ensembl[row['protein_name']]]
+            except KeyError:
                # continue if not found
                continue
            try:
                # try fetching drug
-                disorder = models.Disorder.objects.get(mondo_id=row['disorder_name'])
-            except models.Disorder.DoesNotExist:
+                disorder = self.disorders[str(int(row['disorder_name']))]
+            except KeyError:
                # continue if not found
                continue
            bulk.append(models.ProteinDisorderAssociation(
@@ -231,13 +315,15 @@ class DataPopulator:
        models.ProteinDisorderAssociation.objects.bulk_create(bulk)
        return len(bulk)

-    def populate_drdis_drugbank() -> int:
+    def populate_drdis_drugbank(self) -> int:
        """ Populates the Drug-Disorder-Indications from DrugBank
        Handles Loading the data and passing it to the django database

        Returns:
            int: Count of how many edges were added
        """
+        self.init_drugs()
+        self.init_disorders()
        df = DataLoader.load_drdis_drugbank()
        dataset, _ = models.DrDiDataset.objects.get_or_create(
            name='DrugBank',
@@ -248,14 +334,14 @@ class DataPopulator:
        for _, row in df.iterrows():
            try:
                # try fetching protein
-                drug = models.Drug.objects.get(drug_id=row['drugbank_id'])
-            except models.Drug.DoesNotExist:
+                drug = self.drugs[row['drugbank_id']]
+            except KeyError:
                # continue if not found
                continue
            try:
                # try fetching drug
-                disorder = models.Disorder.objects.get(mondo_id=row['mondo_id'])
-            except models.Disorder.DoesNotExist:
+                disorder = self.disorders[str(int(row['mondo_id']))]
+            except KeyError:
                # continue if not found
                continue
            bulk.append(models.DrugDisorderIndication(
@@ -266,13 +352,15 @@ class DataPopulator:
        models.DrugDisorderIndication.objects.bulk_create(bulk)
        return len(bulk)

-    def populate_pdi_dgidb() -> int:
+    def populate_pdi_dgidb(self) -> int:
        """ Populates the Protein-Drug-Interactions from DGIdb
        Handles Loading the data and passing it to the django database

        Returns:
            int: Count of how many interactions were added
        """
+        self.init_proteins()
+        self.init_drugs()
        df = DataLoader.load_pdi_dgidb()
        dataset, _ = models.PDIDataset.objects.get_or_create(
            name='DGIdb',
@@ -283,14 +371,14 @@ class DataPopulator:
        for _, row in df.iterrows():
            try:
                # try fetching protein
-                protein = models.Protein.objects.get(entrez=row['entrez_id'])
-            except models.Protein.DoesNotExist:
+                protein = self.proteins[row['entrez_id']]
+            except KeyError:
                # continue if not found
                continue
            try:
                # try fetching drug
-                drug = models.Drug.objects.get(drug_id=row['drug_id'])
-            except models.Drug.DoesNotExist:
+                drug = self.drugs[row['drug_id']]
+            except KeyError:
                # continue if not found
                continue
            bulk.append(models.ProteinDrugInteraction(
@@ -301,13 +389,15 @@ class DataPopulator:
        models.ProteinDrugInteraction.objects.bulk_create(bulk)
        return len(bulk)

-    def populate_pdi_drugbank() -> int:
+    def populate_pdi_drugbank(self) -> int:
        """ Populates the Protein-Drug-Interactions from Drugbank
        Handles Loading the data and passing it to the django database

        Returns:
            int: Count of how many interactions were added
        """
+        self.init_proteins()
+        self.init_drugs()
        df = DataLoader.load_pdi_drugbank()
        dataset, _ = models.PDIDataset.objects.get_or_create(
            name='DrugBank',
@@ -318,14 +408,14 @@ class DataPopulator:
        for _, row in df.iterrows():
            try:
                # try fetching protein
-                protein = models.Protein.objects.get(entrez=row['entrez_id'])
-            except models.Protein.DoesNotExist:
+                protein = self.proteins[row['entrez_id']]
+            except KeyError:
                # continue if not found
                continue
            try:
                # try fetching drug
-                drug = models.Drug.objects.get(drug_id=row['drug_id'])
-            except models.Drug.DoesNotExist:
+                drug = self.drugs[row['drug_id']]
+            except KeyError:
                # continue if not found
                continue
            bulk.append(models.ProteinDrugInteraction(

--- a/drugstone/views.py
+++ b/drugstone/views.py
@@ -78,6 +78,7 @@ class ProteinDrugInteractionView(APIView):

    def get(self, request) -> Response:
        if request.query_params.get('proteins'):
+            print("getting drugs for proteins")
            protein_ac_list = json.loads(request.query_params.get('proteins'))
            proteins = list(Protein.objects.filter(uniprot_code__in=protein_ac_list).all())
        else: