Skip to content
Snippets Groups Projects
Commit ca0e5a7b authored by AndiMajore's avatar AndiMajore
Browse files

db init cleanup and significant speedup

Former-commit-id: b98d1951d6d94f03ffe97486b492d734a2a4258e [formerly 88e844cb12d9d9e129f9b7f50856d11124a4a475]
Former-commit-id: 66631e2e8e954e5144472bd8ffb170ed4858c30c
parent e46ee3df
No related branches found
No related tags found
No related merge requests found
File moved
This diff is collapsed.
This diff is collapsed.
......@@ -2,7 +2,7 @@ from django.core.management.base import BaseCommand
import pandas as pd
from django.db import OperationalError, IntegrityError
from drugstone.models import Protein, Drug, Tissue, ExpressionLevel, PPIDataset, PDIDataset, Disorder, PDisDataset
from drugstone.models import Protein, Drug, Tissue, ExpressionLevel, PPIDataset, PDIDataset, Disorder, PDisDataset, DrDiDataset
from drugstone.models import ProteinProteinInteraction, ProteinDrugInteraction
from drugstone.management.includes.DataPopulator import DataPopulator
......@@ -57,55 +57,6 @@ class DatabasePopulator:
self.delete_model(DrDiDataset)
def populate_drug_model(self):
    """Upsert one Drug row per line of the tab-separated drug file.

    The file is read from ``{self.data_dir}/data_drugstone/{self.drug_file}``
    and must provide the columns ``drug_id``, ``drug_name`` and ``drug_status``.
    """
    print('Populating Drug model ...')
    source_path = f'{self.data_dir}/data_drugstone/{self.drug_file}'
    drug_table = pd.read_csv(source_path, delimiter='\t')
    for _, record in drug_table.iterrows():
        # NOTE(review): all three fields are passed as lookup arguments (no
        # ``defaults=``), so a row only matches an existing record when id,
        # name and status are all equal -- confirm this is intended.
        # No 'links' value is passed to the model.
        Drug.objects.update_or_create(
            drug_id=record['drug_id'],
            name=record['drug_name'],
            status=record['drug_status'],
        )
    print('Done!\n')
def populate_exp_model(self):
    """Fill the Tissue and ExpressionLevel tables from the expression file.

    The file is read from ``{self.data_dir}/data_drugstone/{self.exp_file}``
    (tab-separated). Every column after the first two is treated as a tissue;
    the ``Description`` column is matched against ``Protein.gene``.
    """
    print('Populating Tissue and ExpressionLevel model ...')
    expression_path = f'{self.data_dir}/data_drugstone/{self.exp_file}'
    exp_table = pd.read_csv(expression_path, delimiter='\t')

    # Fetch each tissue, creating it when it does not exist yet.
    tissue_by_name = {}
    for column in exp_table.columns.values[2:]:
        try:
            tissue_by_name[column] = Tissue.objects.get(name=column)
        except Tissue.DoesNotExist:
            tissue_by_name[column] = Tissue.objects.create(name=column)

    # Counts matched proteins (one per row/protein pair), not created rows.
    proteins_linked = 0
    for _, record in exp_table.iterrows():
        matching_proteins = Protein.objects.filter(gene=record['Description']).all()
        for protein in matching_proteins:
            proteins_linked += 1
            for column, tissue in tissue_by_name.items():
                try:
                    ExpressionLevel.objects.create(
                        protein=protein,
                        tissue=tissue,
                        expression_level=record[column],
                    )
                except IntegrityError:
                    # The database rejected the row; skip it and carry on.
                    continue
    print(f'Added {proteins_linked} expression levels!\n')
class Command(BaseCommand):
def add_arguments(self, parser):
......@@ -139,6 +90,7 @@ class Command(BaseCommand):
pp = kwargs['protein_protein']
pd = kwargs['protein_drug']
db_populator = DatabasePopulator(data_dir=data_dir,
# protein_file=protein_file,
drug_file=drug_file,
......@@ -152,8 +104,12 @@ class Command(BaseCommand):
db_populator.delete_models(model_list)
return
populator = DataPopulator()
if kwargs['drug_file'] is not None:
db_populator.populate_drug_model()
print('Populating Drugs...')
n = DataPopulator.populate_drugs(populator)
print(f'Populated {n} Drugs.')
# if kwargs['protein_file'] is not None:
# db_poulator.populate_protein_model()
......@@ -165,52 +121,54 @@ class Command(BaseCommand):
# db_poulator.populate_ppi_model()
if kwargs['exp_file'] is not None:
db_populator.populate_exp_model()
print('Populating Expressions...')
n = DataPopulator.populate_expessions(populator)
print(f'Populated {n} Expressions.')
if kwargs['proteins'] is not None:
print('Populating Proteins...')
n = DataPopulator.populate_proteins()
n = DataPopulator.populate_proteins(populator)
print(f'Populated {n} Proteins.')
print('Populating ENSG IDs...')
n = DataPopulator.populate_ensg()
n = DataPopulator.populate_ensg(populator)
print(f'Populated {n} ENSG IDs.')
if kwargs['disorders'] is not None:
print('Populating Disorders...')
n = DataPopulator.populate_disorders()
n = DataPopulator.populate_disorders(populator)
print(f'Populated {n} Disorders.')
if kwargs['protein_protein'] is not None:
print('Populating PPIs from STRING...')
n = DataPopulator.populate_ppi_string()
n = DataPopulator.populate_ppi_string(populator)
print(f'Populated {n} PPIs from STRING.')
print('Populating PPIs from APID...')
n = DataPopulator.populate_ppi_apid()
n = DataPopulator.populate_ppi_apid(populator)
print(f'Populated {n} PPIs from APID.')
print('Populating PPIs from BioGRID...')
n = DataPopulator.populate_ppi_biogrid()
n = DataPopulator.populate_ppi_biogrid(populator)
print(f'Populated {n} PPIs from BioGRID.')
if kwargs['protein_drug'] is not None:
print('Populating PDIs from Chembl...')
n = DataPopulator.populate_pdi_chembl()
n = DataPopulator.populate_pdi_chembl(populator)
print(f'Populated {n} PDIs from Chembl.')
print('Populating PDIs from DGIdb...')
n = DataPopulator.populate_pdi_dgidb()
n = DataPopulator.populate_pdi_dgidb(populator)
print(f'Populated {n} PDIs from DGIdb.')
print('Populating PDIs from DrugBank...')
n = DataPopulator.populate_pdi_drugbank()
n = DataPopulator.populate_pdi_drugbank(populator)
print(f'Populated {n} PDIs from DrugBank.')
if kwargs['protein_disorder'] is not None:
print('Populating PDis associations from DisGeNET...')
n=DataPopulator.populate_pdis_disgenet()
n=DataPopulator.populate_pdis_disgenet(populator)
print(f'Populated {n} PDis associations from DisGeNET.')
if kwargs['drug_disorder'] is not None:
print('Populating DrDi indications from DrugBank...')
n=DataPopulator.populate_drdis_drugbank()
n=DataPopulator.populate_drdis_drugbank(populator)
print(f'Populated {n} DrDi associations from DrugBank.')
......@@ -4,6 +4,8 @@ import json
class DataLoader:
PATH_PROTEINS = 'data_drugstone/Proteins/'
PATH_DRUGS = 'data_drugstone/Drugs/'
PATH_EXPR = 'data_drugstone/'
PATH_DISORDERS = 'data_drugstone/Disorders/'
PATH_PDI = 'data_drugstone/PDI/'
PATH_PPI = 'data_drugstone/PPI/'
......@@ -16,15 +18,20 @@ class DataLoader:
# Disorders
DISORDERS_MONDO = 'disorders.tsv'
#Drugs
DRUG_FILE = 'drug-file.txt'
#Expressions
EXPR_FILE = 'gene_tissue_expression.gct'
# Protein-Protein-Interactions
PPI_APID = 'apid_9606_Q2.txt'
PPI_BIOGRID = 'BIOGRID-ORGANISM-Homo_sapiens-3.5.187.mitab.txt'
PPI_STRING = 'string_interactions.csv'
# Protein-Drug-Interactions
PDI_CHEMBL = 'chembl_drug_gene_interactions.csv'
PDI_CHEMBL = 'chembl_drug_gene_interactions_uniq.csv'
PDI_DGIDB = 'DGIdb_drug_gene_interactions.csv'
PDI_DRUGBANK = 'drugbank_drug_gene_interactions.csv'
PDI_DRUGBANK = 'drugbank_drug_gene_interactions_uniq.csv'
# Protein-Disorder-Interaction
PDi_DISGENET = 'disgenet-protein_disorder_association.tsv'
......@@ -59,6 +66,14 @@ class DataLoader:
df = pd.read_csv(f'{DataLoader.PATH_PROTEINS}{DataLoader.PROTEINS_COVEX}')
df['entrez_id'] = df['entrez_id'].map(DataLoader._clean_entrez)
return df
@staticmethod
def load_drugs() -> pd.DataFrame:
    """Loads the drug file (tab-separated) into a DataFrame.

    The path is built from DataLoader.PATH_DRUGS and DataLoader.DRUG_FILE.

    Returns:
        pd.DataFrame: the parsed file, columns as found in its header
    """
    drug_path = f'{DataLoader.PATH_DRUGS}{DataLoader.DRUG_FILE}'
    return pd.read_csv(drug_path, sep='\t')
@staticmethod
def load_expressions() -> pd.DataFrame:
    """Loads the gene/tissue expression file (tab-separated) into a DataFrame.

    The path is built from DataLoader.PATH_EXPR and DataLoader.EXPR_FILE.

    Returns:
        pd.DataFrame: the parsed file, columns as found in its header
    """
    expression_path = f'{DataLoader.PATH_EXPR}{DataLoader.EXPR_FILE}'
    return pd.read_csv(expression_path, sep='\t')
@staticmethod
def load_disorders() -> pd.DataFrame:
......@@ -206,7 +221,7 @@ class DataLoader:
Returns:
pd.DataFrame: columns "drug_id" and "protein_ac"
"""
return pd.read_csv(f'{DataLoader.PATH_PDI}{DataLoader.PDI_CHEMBL}', index_col=0)
return pd.read_csv(f'{DataLoader.PATH_PDI}{DataLoader.PDI_CHEMBL}')
@staticmethod
def load_pdis_disgenet() -> pd.DataFrame:
......@@ -244,6 +259,6 @@ class DataLoader:
Returns:
pd.DataFrame: columns "drug_id" and "entrez_id"
"""
df = pd.read_csv(f'{DataLoader.PATH_PDI}{DataLoader.PDI_DRUGBANK}', index_col=0).dropna()
df = pd.read_csv(f'{DataLoader.PATH_PDI}{DataLoader.PDI_DRUGBANK}').dropna()
df['entrez_id'] = df['entrez_id'].map(DataLoader._clean_entrez)
return df
from collections import defaultdict
from drugstone.management.includes.DataLoader import DataLoader
import drugstone.models as models
class DataPopulator:
proteins = dict()
uniprot_to_ensembl = dict()
gene_name_to_ensembl = defaultdict(lambda: set())
# protein_name_to_ensembl = dict()
disorders = dict()
drugs = dict()
def init_proteins(self):
    """Fill the protein lookup maps from the database, once.

    Does nothing when ``self.proteins`` already holds entries. Otherwise
    every Protein row is indexed by its entrez id, and two secondary maps
    translate uniprot code / gene name to that entrez id.
    """
    if self.proteins:
        return
    print("Generating protein maps...")
    for entry in models.Protein.objects.all():
        entrez_id = entry.entrez
        self.proteins[entrez_id] = entry
        # Despite the "ensembl" in their names, both maps store the entrez id.
        self.uniprot_to_ensembl[entry.uniprot_code] = entrez_id
        self.gene_name_to_ensembl[entry.gene].add(entrez_id)
        # self.protein_name_to_ensembl[entry.protein_name] = entrez_id
def init_drugs(self):
    """Fill ``self.drugs`` (drug_id -> Drug) from the database, once.

    Does nothing when the map already holds entries.
    """
    if self.drugs:
        return
    print("Generating drug map...")
    for entry in models.Drug.objects.all():
        self.drugs[entry.drug_id] = entry
def populate_proteins() -> int:
def init_disorders(self):
    """Fill ``self.disorders`` (mondo_id -> Disorder) from the database, once.

    Does nothing when the map already holds entries.
    """
    if self.disorders:
        return
    print("Generating disorder map...")
    for entry in models.Disorder.objects.all():
        self.disorders[entry.mondo_id] = entry
def populate_proteins(self) -> int:
""" Populates the Protein table in the django database.
Handles loading the data and passing it to the django database
......@@ -12,18 +41,20 @@ class DataPopulator:
int: Count of how many proteins were added
"""
df = DataLoader.load_proteins()
proteins = list()
for _, row in df.iterrows():
proteins.append(models.Protein(
self.proteins[row['entrez_id']] = models.Protein(
uniprot_code=row['protein_ac'],
gene=row['gene_name'],
entrez=row['entrez_id'],
protein_name=row['protein_name'])
)
models.Protein.objects.bulk_create(proteins)
return len(proteins)
self.uniprot_to_ensembl[row['protein_ac']] = row['entrez_id']
self.gene_name_to_ensembl[row['gene_name']].add(row['entrez_id'])
# self.protein_name_to_ensembl[row['protein_name']] = row['entrez_id']
models.Protein.objects.bulk_create(self.proteins.values())
return len(self.proteins)
def populate_disorders() -> int:
def populate_disorders(self) -> int:
""" Populates the Disorder table in the django database.
Handles loading the data and passing it to the django database
......@@ -31,17 +62,63 @@ class DataPopulator:
int: Count of how many disorders were added
"""
df = DataLoader.load_disorders()
bulk = list()
for _, row in df.iterrows():
bulk.append(models.Disorder(
self.disorders[row['mondo_id']] = models.Disorder(
mondo_id=row['mondo_id'],
label=row['label'],
icd10=row['icd10']
))
models.Disorder.objects.bulk_create(bulk)
)
models.Disorder.objects.bulk_create(self.disorders.values())
return len(self.disorders)
def populate_drugs(self) -> int:
    """ Populates the Drug table in the django database.
    Handles loading the data and passing it to the django database

    Only the rows loaded by this call are sent to the database. The shared
    ``self.drugs`` cache is updated afterwards, so entries it already holds
    (e.g. loaded from the database by ``init_drugs``) are never inserted a
    second time.

    Returns:
        int: Count of how many drugs were added
    """
    df = DataLoader.load_drugs()
    # Keyed by drug_id: a drug_id repeated in the file keeps only its last row.
    new_drugs = {}
    for _, row in df.iterrows():
        drug_id = row['drug_id']
        new_drugs[drug_id] = models.Drug(
            drug_id=drug_id,
            name=row['drug_name'],
            status=row['drug_status'])
    models.Drug.objects.bulk_create(list(new_drugs.values()))
    # Publish to the cache only after the insert succeeded.
    self.drugs.update(new_drugs)
    return len(new_drugs)
def populate_expessions(self) -> int:
    """ Populates the Tissue and ExpressionLevel tables in the django database.
    Handles loading the data and passing it to the django database

    Every column after the first two of the expression file is treated as a
    tissue. Each row's ``Description`` value is looked up as a gene name, and
    one ExpressionLevel per tissue is created for every protein of that gene.
    A protein reached through more than one row only gets the values of the
    first such row.

    The misspelled method name is kept because callers use it.

    Returns:
        int: Count of how many expression levels were added
    """
    self.init_proteins()
    df = DataLoader.load_expressions()

    tissues_models = {}
    for tissue_name in df.columns.values[2:]:
        tissue_model, _ = models.Tissue.objects.get_or_create(name=tissue_name)
        tissues_models[tissue_name] = tissue_model

    seen_proteins = set()
    bulk = []
    for _, row in df.iterrows():
        # .get() rather than indexing: gene_name_to_ensembl is a defaultdict,
        # and indexing would add an empty set for every gene that has no protein.
        for protein_id in self.gene_name_to_ensembl.get(row['Description'], ()):
            protein_model = self.proteins[protein_id]
            if protein_id in seen_proteins:
                # All tissues were already queued for this protein.
                continue
            seen_proteins.add(protein_id)
            for tissue_name, tissue_model in tissues_models.items():
                bulk.append(models.ExpressionLevel(protein=protein_model,
                                                   tissue=tissue_model,
                                                   expression_level=row[tissue_name]))
    models.ExpressionLevel.objects.bulk_create(bulk)
    return len(bulk)
def populate_ensg() -> int:
def populate_ensg(self) -> int:
""" Populates the Ensembl-Gene table in the django database.
Also maps the added ensg entries to the corresponding proteins.
Handles loading the data and passing it to the django database
......@@ -49,22 +126,24 @@ class DataPopulator:
Returns:
int: Count of how many ensg-protein relations were added
"""
self.init_proteins()
data = DataLoader.load_ensg()
bulk = list()
for entrez, ensg_list in data.items():
protein = models.Protein.objects.get(entrez=entrez)
protein = self.proteins[entrez]
for ensg in ensg_list:
bulk.append(models.EnsemblGene(name=ensg, protein=protein))
models.EnsemblGene.objects.bulk_create(bulk)
return len(bulk)
def populate_ppi_string() -> int:
def populate_ppi_string(self) -> int:
""" Populates the Protein-Protein-Interactions from STRINGdb
Handles loading the data and passing it to the django database
Returns:
int: Count of how many interactions were added
"""
self.init_proteins()
df = DataLoader.load_ppi_string()
dataset, _ = models.PPIDataset.objects.get_or_create(
name='STRING',
......@@ -75,9 +154,9 @@ class DataPopulator:
for _, row in df.iterrows():
try:
# try fetching proteins
protein_a = models.Protein.objects.get(entrez=row['entrez_a'])
protein_b = models.Protein.objects.get(entrez=row['entrez_b'])
except models.Protein.DoesNotExist:
protein_a = self.proteins[row['entrez_a']]
protein_b = self.proteins[row['entrez_b']]
except KeyError:
# continue if not found
continue
try:
......@@ -92,13 +171,14 @@ class DataPopulator:
models.ProteinProteinInteraction.objects.bulk_create(bulk)
return len(bulk)
def populate_ppi_apid() -> int:
def populate_ppi_apid(self) -> int:
""" Populates the Protein-Protein-Interactions from Apid
Handles loading the data and passing it to the django database
Returns:
int: Count of how many interactions were added
"""
self.init_proteins()
df = DataLoader.load_ppi_apid()
dataset, _ = models.PPIDataset.objects.get_or_create(
name='APID',
......@@ -109,9 +189,9 @@ class DataPopulator:
for _, row in df.iterrows():
try:
# try fetching proteins
protein_a = models.Protein.objects.get(uniprot_code=row['from_protein_ac'])
protein_b = models.Protein.objects.get(uniprot_code=row['to_protein_ac'])
except models.Protein.DoesNotExist:
protein_a = self.proteins[self.uniprot_to_ensembl[row['from_protein_ac']]]
protein_b = self.proteins[self.uniprot_to_ensembl[row['to_protein_ac']]]
except KeyError:
# continue if not found
continue
try:
......@@ -121,18 +201,18 @@ class DataPopulator:
to_protein=protein_b
))
except models.ValidationError:
# duplicate
continue
models.ProteinProteinInteraction.objects.bulk_create(bulk)
return len(bulk)
def populate_ppi_biogrid() -> int:
def populate_ppi_biogrid(self) -> int:
""" Populates the Protein-Protein-Interactions from BioGRID
Handles loading the data and passing it to the django database
Returns:
int: Count of how many interactions were added
"""
self.init_proteins()
df = DataLoader.load_ppi_biogrid()
dataset, _ = models.PPIDataset.objects.get_or_create(
name='BioGRID',
......@@ -143,9 +223,10 @@ class DataPopulator:
for _, row in df.iterrows():
try:
# try fetching proteins
protein_a = models.Protein.objects.get(entrez=row['entrez_a'])
protein_b = models.Protein.objects.get(entrez=row['entrez_b'])
except models.Protein.DoesNotExist:
protein_a = self.proteins[row['entrez_a']]
protein_b = self.proteins[row['entrez_b']]
except KeyError:
# TODO update error
# continue if not found
continue
try:
......@@ -160,13 +241,15 @@ class DataPopulator:
models.ProteinProteinInteraction.objects.bulk_create(bulk)
return len(bulk)
def populate_pdi_chembl() -> int:
def populate_pdi_chembl(self) -> int:
""" Populates the Protein-Drug-Interactions from Chembl
Handles Loading the data and passing it to the django database
Returns:
int: Count of how many interactions were added
"""
self.init_proteins()
self.init_drugs()
df = DataLoader.load_pdi_chembl()
dataset, _ = models.PDIDataset.objects.get_or_create(
name='ChEMBL',
......@@ -176,15 +259,14 @@ class DataPopulator:
bulk = list()
for _, row in df.iterrows():
try:
# try fetching protein
protein = models.Protein.objects.get(uniprot_code=row['protein_ac'])
except models.Protein.DoesNotExist:
protein = self.proteins[self.uniprot_to_ensembl[row['protein_ac']]]
except KeyError:
# continue if not found
continue
try:
# try fetching drug
drug = models.Drug.objects.get(drug_id=row['drug_id'])
except models.Drug.DoesNotExist:
drug = self.drugs[row['drug_id']]
except KeyError:
# continue if not found
continue
bulk.append(models.ProteinDrugInteraction(
......@@ -195,13 +277,15 @@ class DataPopulator:
models.ProteinDrugInteraction.objects.bulk_create(bulk)
return len(bulk)
def populate_pdis_disgenet() -> int:
def populate_pdis_disgenet(self,) -> int:
""" Populates the Protein-Disorder-Interactions from DisGeNET
Handles Loading the data and passing it to the django database
Returns:
int: Count of how many interactions were added
"""
self.init_proteins()
self.init_disorders()
df = DataLoader.load_pdis_disgenet()
dataset, _ = models.PDisDataset.objects.get_or_create(
name='DisGeNET',
......@@ -212,14 +296,14 @@ class DataPopulator:
for _, row in df.iterrows():
try:
# try fetching protein
protein = models.Protein.objects.get(uniprot_code=row['protein_name'])
except models.Protein.DoesNotExist:
protein = self.proteins[self.uniprot_to_ensembl[row['protein_name']]]
except KeyError:
# continue if not found
continue
try:
# try fetching drug
disorder = models.Disorder.objects.get(mondo_id=row['disorder_name'])
except models.Disorder.DoesNotExist:
disorder = self.disorders[str(int(row['disorder_name']))]
except KeyError:
# continue if not found
continue
bulk.append(models.ProteinDisorderAssociation(
......@@ -231,13 +315,15 @@ class DataPopulator:
models.ProteinDisorderAssociation.objects.bulk_create(bulk)
return len(bulk)
def populate_drdis_drugbank() -> int:
def populate_drdis_drugbank(self) -> int:
""" Populates the Drug-Disorder-Indications from DrugBank
Handles Loading the data and passing it to the django database
Returns:
int: Count of how many edges were added
"""
self.init_drugs()
self.init_disorders()
df = DataLoader.load_drdis_drugbank()
dataset, _ = models.DrDiDataset.objects.get_or_create(
name='DrugBank',
......@@ -248,14 +334,14 @@ class DataPopulator:
for _, row in df.iterrows():
try:
# try fetching protein
drug = models.Drug.objects.get(drug_id=row['drugbank_id'])
except models.Drug.DoesNotExist:
drug = self.drugs[row['drugbank_id']]
except KeyError:
# continue if not found
continue
try:
# try fetching drug
disorder = models.Disorder.objects.get(mondo_id=row['mondo_id'])
except models.Disorder.DoesNotExist:
disorder = self.disorders[str(int(row['mondo_id']))]
except KeyError:
# continue if not found
continue
bulk.append(models.DrugDisorderIndication(
......@@ -266,13 +352,15 @@ class DataPopulator:
models.DrugDisorderIndication.objects.bulk_create(bulk)
return len(bulk)
def populate_pdi_dgidb() -> int:
def populate_pdi_dgidb(self) -> int:
""" Populates the Protein-Drug-Interactions from DGIdb
Handles Loading the data and passing it to the django database
Returns:
int: Count of how many interactions were added
"""
self.init_proteins()
self.init_drugs()
df = DataLoader.load_pdi_dgidb()
dataset, _ = models.PDIDataset.objects.get_or_create(
name='DGIdb',
......@@ -283,14 +371,14 @@ class DataPopulator:
for _, row in df.iterrows():
try:
# try fetching protein
protein = models.Protein.objects.get(entrez=row['entrez_id'])
except models.Protein.DoesNotExist:
protein = self.proteins[row['entrez_id']]
except KeyError:
# continue if not found
continue
try:
# try fetching drug
drug = models.Drug.objects.get(drug_id=row['drug_id'])
except models.Drug.DoesNotExist:
drug = self.drugs[row['drug_id']]
except KeyError:
# continue if not found
continue
bulk.append(models.ProteinDrugInteraction(
......@@ -301,13 +389,15 @@ class DataPopulator:
models.ProteinDrugInteraction.objects.bulk_create(bulk)
return len(bulk)
def populate_pdi_drugbank() -> int:
def populate_pdi_drugbank(self) -> int:
""" Populates the Protein-Drug-Interactions from Drugbank
Handles Loading the data and passing it to the django database
Returns:
int: Count of how many interactions were added
"""
self.init_proteins()
self.init_drugs()
df = DataLoader.load_pdi_drugbank()
dataset, _ = models.PDIDataset.objects.get_or_create(
name='DrugBank',
......@@ -318,14 +408,14 @@ class DataPopulator:
for _, row in df.iterrows():
try:
# try fetching protein
protein = models.Protein.objects.get(entrez=row['entrez_id'])
except models.Protein.DoesNotExist:
protein = self.proteins[row['entrez_id']]
except KeyError:
# continue if not found
continue
try:
# try fetching drug
drug = models.Drug.objects.get(drug_id=row['drug_id'])
except models.Drug.DoesNotExist:
drug = self.drugs[row['drug_id']]
except KeyError:
# continue if not found
continue
bulk.append(models.ProteinDrugInteraction(
......
......@@ -78,6 +78,7 @@ class ProteinDrugInteractionView(APIView):
def get(self, request) -> Response:
if request.query_params.get('proteins'):
print("getting drugs for proteins")
protein_ac_list = json.loads(request.query_params.get('proteins'))
proteins = list(Protein.objects.filter(uniprot_code__in=protein_ac_list).all())
else:
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment