Skip to content
Snippets Groups Projects
Commit 8fe8745b authored by AndiMajore
Browse files

switched to bulk inserts

parent 267b8afd
No related branches found
No related tags found
No related merge requests found
Pipeline #11952 failed
...@@ -11,8 +11,8 @@ services: ...@@ -11,8 +11,8 @@ services:
env_file: env_file:
- 'docker-django.env.dev' - 'docker-django.env.dev'
restart: always restart: always
volumes: # volumes:
- ./:/usr/src/drugstone/ # - ./:/usr/src/drugstone/
ports: ports:
- 8001:8000 - 8001:8000
networks: networks:
...@@ -60,8 +60,8 @@ services: ...@@ -60,8 +60,8 @@ services:
hostname: drugstone_celery hostname: drugstone_celery
env_file: env_file:
- './docker-django.env.dev' - './docker-django.env.dev'
volumes: # volumes:
- ./:/usr/src/drugstone/ # - ./:/usr/src/drugstone/
depends_on: depends_on:
- redis - redis
- db - db
...@@ -76,8 +76,8 @@ services: ...@@ -76,8 +76,8 @@ services:
hostname: drugstone_celery_beat hostname: drugstone_celery_beat
env_file: env_file:
- './docker-django.env.dev' - './docker-django.env.dev'
volumes: # volumes:
- ./:/usr/src/drugstone/ # - ./:/usr/src/drugstone/
depends_on: depends_on:
- redis - redis
- db - db
......
# from collections import defaultdict
#
#
# def import_proteins():
# import python_nedrex as nedrex
# from python_nedrex.core import get_nodes, get_api_key, get_edges
# from models import Protein
#
# def iter_node_collection(coll_name, eval):
# offset = 0
# limit = 10000
# while True:
# result = get_nodes(coll_name, offset=offset, limit=limit)
# if not result:
# return
# for node in result:
# eval(node)
# offset += limit
#
# def iter_edge_collection(coll_name, eval):
# offset = 0
# limit = 10000
# while True:
# result = get_edges(coll_name, offset=offset, limit=limit)
# if not result:
# return
# for edge in result:
# eval(edge)
# offset += limit
#
# def add_protein(node):
# global proteins
# id = node['primaryDomainId']
# proteins[id] = Protein(uniprot_code=id.split('.')[1], gene=node['geneName'])
#
# def add_edges(edge):
# global proteins
# id = edge['sourceDomainId']
# protein = proteins[id]
# protein.entrez = edge['targetDomainId'].split('.')[1]
# global gene_to_prots
# gene_to_prots[edge['targetDomainId']].add(id)
#
# def add_genes(node):
# global proteins
# global gene_to_prots
# id = node['primaryDomainId']
# for prot_id in gene_to_prots[id]:
# protein = proteins[prot_id]
# try:
# protein.protein_name = node['synonyms'][0]
# except:
# pass
#
# nedrex.config.set_url_base("http://82.148.225.92:8123/")
# api_key = get_api_key(accept_eula=True)
# nedrex.config.set_api_key(api_key)
#
# proteins = dict()
# gene_to_prots = defaultdict(lambda: set())
#
# print('Importing Proteins')
# iter_node_collection('protein', add_protein)
# print('Importing Protein-Gene mapping')
# iter_edge_collection('protein_encoded_by_gene', add_edges)
# print('Mapping Gene information')
# iter_node_collection('gene', add_genes)
# Protein.objects.bulk_create(proteins.values())
...@@ -74,47 +74,6 @@ class DatabasePopulator: ...@@ -74,47 +74,6 @@ class DatabasePopulator:
print('Done!\n') print('Done!\n')
# def populate_protein_model(self):
# print('Populating Protein model ...')
# protein_df = pd.read_csv(f'{self.data_dir}/data_drugstone/{self.protein_file}', delimiter='\t')
# for _, row in protein_df.iterrows():
# protein_ac = row['protein_ac']
# gene_name = row['gene_name']
# protein_name = row['protein_name']
# if gene_name == 'None':
# gene_name = ''
# protein_object = Protein(uniprot_code=protein_ac, gene=gene_name, protein_name=protein_name)
# protein_object.save()
# print('Done!\n')
# def populate_pdi_model(self):
# print('Populating ProteinDrugInteraction model ...')
# pdi_df = pd.read_csv(f'{self.data_dir}/data_drugstone/{self.pdi_file}', delimiter='\t')
# for _, row in pdi_df.iterrows():
# protein_ac = row['protein_ac']
# drug_id = row['drug_id']
# try:
# protein_object = Protein.objects.get(uniprot_code=protein_ac)
# except Protein.DoesNotExist:
# print(f'Protein AC {protein_ac} not found in Protein model!')
# continue
# try:
# drug_object = Drug.objects.get(drug_id=drug_id)
# except Drug.DoesNotExist:
# print(f'Drug ID {drug_id} not found in Drug model!')
# continue
# # insert protein-drug to PDI model
# pdi_object = ProteinDrugInteraction(protein=protein_object, drug=drug_object)
# pdi_object.save()
# print('Done!\n')
def populate_exp_model(self): def populate_exp_model(self):
print('Populating Tissue and ExpressionLevel model ...') print('Populating Tissue and ExpressionLevel model ...')
...@@ -146,32 +105,6 @@ class DatabasePopulator: ...@@ -146,32 +105,6 @@ class DatabasePopulator:
print(f'Added {proteins_linked} expression levels!\n') print(f'Added {proteins_linked} expression levels!\n')
# def populate_ppi_model(self):
# print('Populating ProteinProteinInteraction model ...')
# pdi_df = pd.read_csv(f'{self.data_dir}/data_drugstone/{self.ppi_file}', delimiter='\t')
# for _, row in pdi_df.iterrows():
# from_protein_ac = row['from_protein_ac']
# to_protein_ac = row['to_protein_ac']
# try:
# from_protein_object = Protein.objects.get(uniprot_code=from_protein_ac)
# except Protein.DoesNotExist:
# print(f'Protein AC {from_protein_ac} not found in Protein model!')
# continue
# try:
# to_protein_object = Protein.objects.get(uniprot_code=to_protein_ac)
# except Protein.DoesNotExist:
# print(f'Protein AC {to_protein_ac} not found in Protein model!')
# continue
# # insert protein-protein edge to ProteinProteinInteraction model
# ppi_object = ProteinProteinInteraction(from_protein=from_protein_object, to_protein=to_protein_object)
# ppi_object.save()
# print('Done!\n')
class Command(BaseCommand): class Command(BaseCommand):
......
...@@ -12,17 +12,16 @@ class DataPopulator: ...@@ -12,17 +12,16 @@ class DataPopulator:
int: Count of how many proteins were added int: Count of how many proteins were added
""" """
df = DataLoader.load_proteins() df = DataLoader.load_proteins()
count = 0 proteins = list()
for _, row in df.iterrows(): for _, row in df.iterrows():
_, created = models.Protein.objects.update_or_create( proteins.append(models.Protein(
uniprot_code=row['protein_ac'], uniprot_code=row['protein_ac'],
gene=row['gene_name'], gene=row['gene_name'],
entrez=row['entrez_id'], entrez=row['entrez_id'],
defaults={'protein_name': row['protein_name']} protein_name=row['protein_name'])
) )
if created: models.Protein.objects.bulk_create(proteins)
count += 1 return len(proteins)
return count
def populate_disorders() -> int: def populate_disorders() -> int:
""" Populates the Disorder table in the django database. """ Populates the Disorder table in the django database.
...@@ -32,17 +31,15 @@ class DataPopulator: ...@@ -32,17 +31,15 @@ class DataPopulator:
int: Count of how many disorders were added int: Count of how many disorders were added
""" """
df = DataLoader.load_disorders() df = DataLoader.load_disorders()
count = 0 bulk = list()
for _, row in df.iterrows(): for _, row in df.iterrows():
_, created = models.Disorder.objects.update_or_create( bulk.append(models.Disorder(
mondo_id=row['mondo_id'], mondo_id=row['mondo_id'],
label=row['label'], label=row['label'],
icd10=row['icd10'], icd10=row['icd10']
defaults={'label': row['label']} ))
) models.Disorder.objects.bulk_create(bulk)
if created: return len(bulk)
count += 1
return count
def populate_ensg() -> int: def populate_ensg() -> int:
""" Populates the Ensembl-Gene table in the django database. """ Populates the Ensembl-Gene table in the django database.
...@@ -53,14 +50,13 @@ class DataPopulator: ...@@ -53,14 +50,13 @@ class DataPopulator:
int: Count of how many ensg-protein relations were added int: Count of how many ensg-protein relations were added
""" """
data = DataLoader.load_ensg() data = DataLoader.load_ensg()
count = 0 bulk = list()
for entrez, ensg_list in data.items(): for entrez, ensg_list in data.items():
protein = models.Protein.objects.get(entrez=entrez) protein = models.Protein.objects.get(entrez=entrez)
for ensg in ensg_list: for ensg in ensg_list:
_, created = models.EnsemblGene.objects.get_or_create(name=ensg, protein=protein) bulk.append(models.EnsemblGene(name=ensg, protein=protein))
if created: models.EnsemblGene.objects.bulk_create(bulk)
count += 1 return len(bulk)
return count
def populate_ppi_string() -> int: def populate_ppi_string() -> int:
""" Populates the Protein-Protein-Interactions from STRINGdb """ Populates the Protein-Protein-Interactions from STRINGdb
...@@ -74,8 +70,8 @@ class DataPopulator: ...@@ -74,8 +70,8 @@ class DataPopulator:
name='STRING', name='STRING',
link='https://string-db.org/', link='https://string-db.org/',
version='11.0' version='11.0'
) )
count = 0 bulk = list()
for _, row in df.iterrows(): for _, row in df.iterrows():
try: try:
# try fetching proteins # try fetching proteins
...@@ -85,17 +81,16 @@ class DataPopulator: ...@@ -85,17 +81,16 @@ class DataPopulator:
# continue if not found # continue if not found
continue continue
try: try:
_, created = models.ProteinProteinInteraction.objects.get_or_create( bulk.append(models.ProteinProteinInteraction(
ppi_dataset=dataset, ppi_dataset=dataset,
from_protein=protein_a, from_protein=protein_a,
to_protein=protein_b to_protein=protein_b
) ))
if created:
count += 1
except models.ValidationError: except models.ValidationError:
# duplicate # duplicate
continue continue
return count models.ProteinProteinInteraction.objects.bulk_create(bulk)
return len(bulk)
def populate_ppi_apid() -> int: def populate_ppi_apid() -> int:
""" Populates the Protein-Protein-Interactions from Apid """ Populates the Protein-Protein-Interactions from Apid
...@@ -109,8 +104,8 @@ class DataPopulator: ...@@ -109,8 +104,8 @@ class DataPopulator:
name='APID', name='APID',
link='http://cicblade.dep.usal.es:8080/APID/', link='http://cicblade.dep.usal.es:8080/APID/',
version='January 2019' version='January 2019'
) )
count = 0 bulk = list()
for _, row in df.iterrows(): for _, row in df.iterrows():
try: try:
# try fetching proteins # try fetching proteins
...@@ -120,17 +115,16 @@ class DataPopulator: ...@@ -120,17 +115,16 @@ class DataPopulator:
# continue if not found # continue if not found
continue continue
try: try:
_, created = models.ProteinProteinInteraction.objects.get_or_create( bulk.append(models.ProteinProteinInteraction(
ppi_dataset=dataset, ppi_dataset=dataset,
from_protein=protein_a, from_protein=protein_a,
to_protein=protein_b to_protein=protein_b
) ))
if created:
count += 1
except models.ValidationError: except models.ValidationError:
# duplicate # duplicate
continue continue
return count models.ProteinProteinInteraction.objects.bulk_create(bulk)
return len(bulk)
def populate_ppi_biogrid() -> int: def populate_ppi_biogrid() -> int:
""" Populates the Protein-Protein-Interactions from BioGRID """ Populates the Protein-Protein-Interactions from BioGRID
...@@ -144,8 +138,8 @@ class DataPopulator: ...@@ -144,8 +138,8 @@ class DataPopulator:
name='BioGRID', name='BioGRID',
link='https://thebiogrid.org/', link='https://thebiogrid.org/',
version='4.0' version='4.0'
) )
count = 0 bulk = list()
for _, row in df.iterrows(): for _, row in df.iterrows():
try: try:
# try fetching proteins # try fetching proteins
...@@ -155,17 +149,16 @@ class DataPopulator: ...@@ -155,17 +149,16 @@ class DataPopulator:
# continue if not found # continue if not found
continue continue
try: try:
_, created = models.ProteinProteinInteraction.objects.get_or_create( bulk.append(models.ProteinProteinInteraction(
ppi_dataset=dataset, ppi_dataset=dataset,
from_protein=protein_a, from_protein=protein_a,
to_protein=protein_b to_protein=protein_b
) ))
if created:
count += 1
except models.ValidationError: except models.ValidationError:
# duplicate # duplicate
continue continue
return count models.ProteinProteinInteraction.objects.bulk_create(bulk)
return len(bulk)
def populate_pdi_chembl() -> int: def populate_pdi_chembl() -> int:
""" Populates the Protein-Drug-Interactions from Chembl """ Populates the Protein-Drug-Interactions from Chembl
...@@ -179,9 +172,9 @@ class DataPopulator: ...@@ -179,9 +172,9 @@ class DataPopulator:
name='ChEMBL', name='ChEMBL',
link='https://www.ebi.ac.uk/chembl/', link='https://www.ebi.ac.uk/chembl/',
version='27', version='27',
) )
count = 0 bulk = list()
for index, row in df.iterrows(): for _, row in df.iterrows():
try: try:
# try fetching protein # try fetching protein
protein = models.Protein.objects.get(uniprot_code=row['protein_ac']) protein = models.Protein.objects.get(uniprot_code=row['protein_ac'])
...@@ -194,14 +187,13 @@ class DataPopulator: ...@@ -194,14 +187,13 @@ class DataPopulator:
except models.Drug.DoesNotExist: except models.Drug.DoesNotExist:
# continue if not found # continue if not found
continue continue
_, created = models.ProteinDrugInteraction.objects.get_or_create( bulk.append(models.ProteinDrugInteraction(
pdi_dataset=dataset, pdi_dataset=dataset,
protein=protein, protein=protein,
drug=drug drug=drug
) ))
if created: models.ProteinProteinInteraction.objects.bulk_create(bulk)
count += 1 return len(bulk)
return count
def populate_pdis_disgenet() -> int: def populate_pdis_disgenet() -> int:
""" Populates the Protein-Disorder-Interactions from DisGeNET """ Populates the Protein-Disorder-Interactions from DisGeNET
...@@ -215,9 +207,9 @@ class DataPopulator: ...@@ -215,9 +207,9 @@ class DataPopulator:
name='DisGeNET', name='DisGeNET',
link='https://www.disgenet.org/home/', link='https://www.disgenet.org/home/',
version='6.0', version='6.0',
) )
count = 0 bulk = list()
for index, row in df.iterrows(): for _, row in df.iterrows():
try: try:
# try fetching protein # try fetching protein
protein = models.Protein.objects.get(uniprot_code=row['protein_name']) protein = models.Protein.objects.get(uniprot_code=row['protein_name'])
...@@ -230,15 +222,14 @@ class DataPopulator: ...@@ -230,15 +222,14 @@ class DataPopulator:
except models.Disorder.DoesNotExist: except models.Disorder.DoesNotExist:
# continue if not found # continue if not found
continue continue
_, created = models.ProteinDisorderAssociation.objects.get_or_create( bulk.append(models.ProteinDisorderAssociation(
pdis_dataset=dataset, pdis_dataset=dataset,
protein=protein, protein=protein,
disorder=disorder, disorder=disorder,
score=row['score'] score=row['score']
) ))
if created: models.ProteinDisorderAssociation.objects.bulk_create(bulk)
count += 1 return len(bulk)
return count
def populate_drdis_drugbank() -> int: def populate_drdis_drugbank() -> int:
""" Populates the Drug-Disorder-Indications from DrugBank """ Populates the Drug-Disorder-Indications from DrugBank
...@@ -253,8 +244,8 @@ class DataPopulator: ...@@ -253,8 +244,8 @@ class DataPopulator:
link='https://go.drugbank.com/', link='https://go.drugbank.com/',
version='5.1.8', version='5.1.8',
) )
count = 0 bulk = list()
for index, row in df.iterrows(): for _, row in df.iterrows():
try: try:
# try fetching protein # try fetching protein
drug = models.Drug.objects.get(drug_id=row['drugbank_id']) drug = models.Drug.objects.get(drug_id=row['drugbank_id'])
...@@ -267,14 +258,13 @@ class DataPopulator: ...@@ -267,14 +258,13 @@ class DataPopulator:
except models.Disorder.DoesNotExist: except models.Disorder.DoesNotExist:
# continue if not found # continue if not found
continue continue
_, created = models.DrugDisorderIndication.objects.get_or_create( bulk.append(models.DrugDisorderIndication(
drdi_dataset=dataset, drdi_dataset=dataset,
drug=drug, drug=drug,
disorder=disorder, disorder=disorder,
) ))
if created: models.DrugDisorderIndication.objects.bulk_create(bulk)
count += 1 return len(bulk)
return count
def populate_pdi_dgidb() -> int: def populate_pdi_dgidb() -> int:
""" Populates the Protein-Drug-Interactions from DGIdb """ Populates the Protein-Drug-Interactions from DGIdb
...@@ -288,8 +278,8 @@ class DataPopulator: ...@@ -288,8 +278,8 @@ class DataPopulator:
name='DGIdb', name='DGIdb',
link='https://www.dgidb.org/', link='https://www.dgidb.org/',
version='4.2.0' version='4.2.0'
) )
count = 0 bulk = list()
for _, row in df.iterrows(): for _, row in df.iterrows():
try: try:
# try fetching protein # try fetching protein
...@@ -303,14 +293,13 @@ class DataPopulator: ...@@ -303,14 +293,13 @@ class DataPopulator:
except models.Drug.DoesNotExist: except models.Drug.DoesNotExist:
# continue if not found # continue if not found
continue continue
_, created = models.ProteinDrugInteraction.objects.get_or_create( bulk.append(models.ProteinDrugInteraction(
pdi_dataset=dataset, pdi_dataset=dataset,
protein=protein, protein=protein,
drug=drug drug=drug
) ))
if created: models.ProteinDrugInteraction.objects.bulk_create(bulk)
count += 1 return len(bulk)
return count
def populate_pdi_drugbank() -> int: def populate_pdi_drugbank() -> int:
""" Populates the Protein-Drug-Interactions from Drugbank """ Populates the Protein-Drug-Interactions from Drugbank
...@@ -324,8 +313,8 @@ class DataPopulator: ...@@ -324,8 +313,8 @@ class DataPopulator:
name='DrugBank', name='DrugBank',
link='https://go.drugbank.com/', link='https://go.drugbank.com/',
version='5.1.7' version='5.1.7'
) )
count = 0 bulk = list()
for _, row in df.iterrows(): for _, row in df.iterrows():
try: try:
# try fetching protein # try fetching protein
...@@ -339,11 +328,10 @@ class DataPopulator: ...@@ -339,11 +328,10 @@ class DataPopulator:
except models.Drug.DoesNotExist: except models.Drug.DoesNotExist:
# continue if not found # continue if not found
continue continue
_, created = models.ProteinDrugInteraction.objects.get_or_create( bulk.append(models.ProteinDrugInteraction(
pdi_dataset=dataset, pdi_dataset=dataset,
protein=protein, protein=protein,
drug=drug drug=drug
) ))
if created: models.ProteinDrugInteraction.objects.bulk_create(bulk)
count += 1 return len(bulk)
return count
...@@ -3,6 +3,6 @@ from celery.schedules import crontab ...@@ -3,6 +3,6 @@ from celery.schedules import crontab
CELERY_BEAT_SCHEDULE = { CELERY_BEAT_SCHEDULE = {
'update_db': { 'update_db': {
'task': 'drugstone.tasks.task_update_db_from_nedrex', 'task': 'drugstone.tasks.task_update_db_from_nedrex',
'schedule': crontab(minute='*/1000'), 'schedule': crontab(minute='*/1'),
}, },
} }
...@@ -24,7 +24,7 @@ BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) ...@@ -24,7 +24,7 @@ BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
SECRET_KEY = os.environ.get('SECRET_KEY') SECRET_KEY = os.environ.get('SECRET_KEY')
# SECURITY WARNING: don't run with debug turned on in production! # SECURITY WARNING: don't run with debug turned on in production!
DEBUG = os.environ.get('DEBUG', False) DEBUG = os.environ.get('DEBUG') == '1'
ALLOWED_HOSTS = [ ALLOWED_HOSTS = [
'localhost', 'localhost',
...@@ -45,6 +45,7 @@ INSTALLED_APPS = [ ...@@ -45,6 +45,7 @@ INSTALLED_APPS = [
'django.contrib.staticfiles', 'django.contrib.staticfiles',
'corsheaders', 'corsheaders',
'drugstone', 'drugstone',
# 'python_nedrex',
'rest_framework', 'rest_framework',
] ]
......
...@@ -11,8 +11,8 @@ def task_update_db_from_nedrex(): ...@@ -11,8 +11,8 @@ def task_update_db_from_nedrex():
print('here') print('here')
logger.info('Fetching data...') logger.info('Fetching data...')
fetch_nedrex_data() # fetch_nedrex_data()
logger.info('Integrating data...') logger.info('Integrating data...')
integrate_nedrex_data() # integrate_nedrex_data()
logger.info('Done.') logger.info('Done.')
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment