Skip to content
Snippets Groups Projects
Commit d961f24e authored by AndiMajore
Browse files

switched to bulk inserts

Former-commit-id: f7cfdb56
parent 2807af39
No related branches found
No related tags found
No related merge requests found
...@@ -11,8 +11,8 @@ services: ...@@ -11,8 +11,8 @@ services:
env_file: env_file:
- 'docker-django.env.dev' - 'docker-django.env.dev'
restart: always restart: always
volumes: # volumes:
- ./:/usr/src/drugstone/ # - ./:/usr/src/drugstone/
ports: ports:
- 8001:8000 - 8001:8000
networks: networks:
...@@ -60,8 +60,8 @@ services: ...@@ -60,8 +60,8 @@ services:
hostname: drugstone_celery hostname: drugstone_celery
env_file: env_file:
- './docker-django.env.dev' - './docker-django.env.dev'
volumes: # volumes:
- ./:/usr/src/drugstone/ # - ./:/usr/src/drugstone/
depends_on: depends_on:
- redis - redis
- db - db
...@@ -76,8 +76,8 @@ services: ...@@ -76,8 +76,8 @@ services:
hostname: drugstone_celery_beat hostname: drugstone_celery_beat
env_file: env_file:
- './docker-django.env.dev' - './docker-django.env.dev'
volumes: # volumes:
- ./:/usr/src/drugstone/ # - ./:/usr/src/drugstone/
depends_on: depends_on:
- redis - redis
- db - db
......
# from collections import defaultdict
#
#
# def import_proteins():
# import python_nedrex as nedrex
# from python_nedrex.core import get_nodes, get_api_key, get_edges
# from models import Protein
#
# def iter_node_collection(coll_name, eval):
# offset = 0
# limit = 10000
# while True:
# result = get_nodes(coll_name, offset=offset, limit=limit)
# if not result:
# return
# for node in result:
# eval(node)
# offset += limit
#
# def iter_edge_collection(coll_name, eval):
# offset = 0
# limit = 10000
# while True:
# result = get_edges(coll_name, offset=offset, limit=limit)
# if not result:
# return
# for edge in result:
# eval(edge)
# offset += limit
#
# def add_protein(node):
# global proteins
# id = node['primaryDomainId']
# proteins[id] = Protein(uniprot_code=id.split('.')[1], gene=node['geneName'])
#
# def add_edges(edge):
# global proteins
# id = edge['sourceDomainId']
# protein = proteins[id]
# protein.entrez = edge['targetDomainId'].split('.')[1]
# global gene_to_prots
# gene_to_prots[edge['targetDomainId']].add(id)
#
# def add_genes(node):
# global proteins
# global gene_to_prots
# id = node['primaryDomainId']
# for prot_id in gene_to_prots[id]:
# protein = proteins[prot_id]
# try:
# protein.protein_name = node['synonyms'][0]
# except:
# pass
#
# nedrex.config.set_url_base("http://82.148.225.92:8123/")
# api_key = get_api_key(accept_eula=True)
# nedrex.config.set_api_key(api_key)
#
# proteins = dict()
# gene_to_prots = defaultdict(lambda: set())
#
# print('Importing Proteins')
# iter_node_collection('protein', add_protein)
# print('Importing Protein-Gene mapping')
# iter_edge_collection('protein_encoded_by_gene', add_edges)
# print('Mapping Gene information')
# iter_node_collection('gene', add_genes)
# Protein.objects.bulk_create(proteins.values())
...@@ -74,47 +74,6 @@ class DatabasePopulator: ...@@ -74,47 +74,6 @@ class DatabasePopulator:
print('Done!\n') print('Done!\n')
# def populate_protein_model(self):
# print('Populating Protein model ...')
# protein_df = pd.read_csv(f'{self.data_dir}/data_drugstone/{self.protein_file}', delimiter='\t')
# for _, row in protein_df.iterrows():
# protein_ac = row['protein_ac']
# gene_name = row['gene_name']
# protein_name = row['protein_name']
# if gene_name == 'None':
# gene_name = ''
# protein_object = Protein(uniprot_code=protein_ac, gene=gene_name, protein_name=protein_name)
# protein_object.save()
# print('Done!\n')
# def populate_pdi_model(self):
# print('Populating ProteinDrugInteraction model ...')
# pdi_df = pd.read_csv(f'{self.data_dir}/data_drugstone/{self.pdi_file}', delimiter='\t')
# for _, row in pdi_df.iterrows():
# protein_ac = row['protein_ac']
# drug_id = row['drug_id']
# try:
# protein_object = Protein.objects.get(uniprot_code=protein_ac)
# except Protein.DoesNotExist:
# print(f'Protein AC {protein_ac} not found in Protein model!')
# continue
# try:
# drug_object = Drug.objects.get(drug_id=drug_id)
# except Drug.DoesNotExist:
# print(f'Drug ID {drug_id} not found in Drug model!')
# continue
# # insert protein-drug to PDI model
# pdi_object = ProteinDrugInteraction(protein=protein_object, drug=drug_object)
# pdi_object.save()
# print('Done!\n')
def populate_exp_model(self): def populate_exp_model(self):
print('Populating Tissue and ExpressionLevel model ...') print('Populating Tissue and ExpressionLevel model ...')
...@@ -146,32 +105,6 @@ class DatabasePopulator: ...@@ -146,32 +105,6 @@ class DatabasePopulator:
print(f'Added {proteins_linked} expression levels!\n') print(f'Added {proteins_linked} expression levels!\n')
# def populate_ppi_model(self):
# print('Populating ProteinProteinInteraction model ...')
# pdi_df = pd.read_csv(f'{self.data_dir}/data_drugstone/{self.ppi_file}', delimiter='\t')
# for _, row in pdi_df.iterrows():
# from_protein_ac = row['from_protein_ac']
# to_protein_ac = row['to_protein_ac']
# try:
# from_protein_object = Protein.objects.get(uniprot_code=from_protein_ac)
# except Protein.DoesNotExist:
# print(f'Protein AC {from_protein_ac} not found in Protein model!')
# continue
# try:
# to_protein_object = Protein.objects.get(uniprot_code=to_protein_ac)
# except Protein.DoesNotExist:
# print(f'Protein AC {to_protein_ac} not found in Protein model!')
# continue
# # insert protein-protein edge to ProteinProteinInteraction model
# ppi_object = ProteinProteinInteraction(from_protein=from_protein_object, to_protein=to_protein_object)
# ppi_object.save()
# print('Done!\n')
class Command(BaseCommand): class Command(BaseCommand):
......
...@@ -12,17 +12,16 @@ class DataPopulator: ...@@ -12,17 +12,16 @@ class DataPopulator:
int: Count of how many proteins were added int: Count of how many proteins were added
""" """
df = DataLoader.load_proteins() df = DataLoader.load_proteins()
count = 0 proteins = list()
for _, row in df.iterrows(): for _, row in df.iterrows():
_, created = models.Protein.objects.update_or_create( proteins.append(models.Protein(
uniprot_code=row['protein_ac'], uniprot_code=row['protein_ac'],
gene=row['gene_name'], gene=row['gene_name'],
entrez=row['entrez_id'], entrez=row['entrez_id'],
defaults={'protein_name': row['protein_name']} protein_name=row['protein_name'])
) )
if created: models.Protein.objects.bulk_create(proteins)
count += 1 return len(proteins)
return count
def populate_disorders() -> int: def populate_disorders() -> int:
""" Populates the Disorder table in the django database. """ Populates the Disorder table in the django database.
...@@ -32,17 +31,15 @@ class DataPopulator: ...@@ -32,17 +31,15 @@ class DataPopulator:
int: Count of how many disorders were added int: Count of how many disorders were added
""" """
df = DataLoader.load_disorders() df = DataLoader.load_disorders()
count = 0 bulk = list()
for _, row in df.iterrows(): for _, row in df.iterrows():
_, created = models.Disorder.objects.update_or_create( bulk.append(models.Disorder(
mondo_id=row['mondo_id'], mondo_id=row['mondo_id'],
label=row['label'], label=row['label'],
icd10=row['icd10'], icd10=row['icd10']
defaults={'label': row['label']} ))
) models.Disorder.objects.bulk_create(bulk)
if created: return len(bulk)
count += 1
return count
def populate_ensg() -> int: def populate_ensg() -> int:
""" Populates the Ensembl-Gene table in the django database. """ Populates the Ensembl-Gene table in the django database.
...@@ -53,14 +50,13 @@ class DataPopulator: ...@@ -53,14 +50,13 @@ class DataPopulator:
int: Count of how many ensg-protein relations were added int: Count of how many ensg-protein relations were added
""" """
data = DataLoader.load_ensg() data = DataLoader.load_ensg()
count = 0 bulk = list()
for entrez, ensg_list in data.items(): for entrez, ensg_list in data.items():
protein = models.Protein.objects.get(entrez=entrez) protein = models.Protein.objects.get(entrez=entrez)
for ensg in ensg_list: for ensg in ensg_list:
_, created = models.EnsemblGene.objects.get_or_create(name=ensg, protein=protein) bulk.append(models.EnsemblGene(name=ensg, protein=protein))
if created: models.EnsemblGene.objects.bulk_create(bulk)
count += 1 return len(bulk)
return count
def populate_ppi_string() -> int: def populate_ppi_string() -> int:
""" Populates the Protein-Protein-Interactions from STRINGdb """ Populates the Protein-Protein-Interactions from STRINGdb
...@@ -75,7 +71,7 @@ class DataPopulator: ...@@ -75,7 +71,7 @@ class DataPopulator:
link='https://string-db.org/', link='https://string-db.org/',
version='11.0' version='11.0'
) )
count = 0 bulk = list()
for _, row in df.iterrows(): for _, row in df.iterrows():
try: try:
# try fetching proteins # try fetching proteins
...@@ -85,17 +81,16 @@ class DataPopulator: ...@@ -85,17 +81,16 @@ class DataPopulator:
# continue if not found # continue if not found
continue continue
try: try:
_, created = models.ProteinProteinInteraction.objects.get_or_create( bulk.append(models.ProteinProteinInteraction(
ppi_dataset=dataset, ppi_dataset=dataset,
from_protein=protein_a, from_protein=protein_a,
to_protein=protein_b to_protein=protein_b
) ))
if created:
count += 1
except models.ValidationError: except models.ValidationError:
# duplicate # duplicate
continue continue
return count models.ProteinProteinInteraction.objects.bulk_create(bulk)
return len(bulk)
def populate_ppi_apid() -> int: def populate_ppi_apid() -> int:
""" Populates the Protein-Protein-Interactions from Apid """ Populates the Protein-Protein-Interactions from Apid
...@@ -110,7 +105,7 @@ class DataPopulator: ...@@ -110,7 +105,7 @@ class DataPopulator:
link='http://cicblade.dep.usal.es:8080/APID/', link='http://cicblade.dep.usal.es:8080/APID/',
version='January 2019' version='January 2019'
) )
count = 0 bulk = list()
for _, row in df.iterrows(): for _, row in df.iterrows():
try: try:
# try fetching proteins # try fetching proteins
...@@ -120,17 +115,16 @@ class DataPopulator: ...@@ -120,17 +115,16 @@ class DataPopulator:
# continue if not found # continue if not found
continue continue
try: try:
_, created = models.ProteinProteinInteraction.objects.get_or_create( bulk.append(models.ProteinProteinInteraction(
ppi_dataset=dataset, ppi_dataset=dataset,
from_protein=protein_a, from_protein=protein_a,
to_protein=protein_b to_protein=protein_b
) ))
if created:
count += 1
except models.ValidationError: except models.ValidationError:
# duplicate # duplicate
continue continue
return count models.ProteinProteinInteraction.objects.bulk_create(bulk)
return len(bulk)
def populate_ppi_biogrid() -> int: def populate_ppi_biogrid() -> int:
""" Populates the Protein-Protein-Interactions from BioGRID """ Populates the Protein-Protein-Interactions from BioGRID
...@@ -145,7 +139,7 @@ class DataPopulator: ...@@ -145,7 +139,7 @@ class DataPopulator:
link='https://thebiogrid.org/', link='https://thebiogrid.org/',
version='4.0' version='4.0'
) )
count = 0 bulk = list()
for _, row in df.iterrows(): for _, row in df.iterrows():
try: try:
# try fetching proteins # try fetching proteins
...@@ -155,17 +149,16 @@ class DataPopulator: ...@@ -155,17 +149,16 @@ class DataPopulator:
# continue if not found # continue if not found
continue continue
try: try:
_, created = models.ProteinProteinInteraction.objects.get_or_create( bulk.append(models.ProteinProteinInteraction(
ppi_dataset=dataset, ppi_dataset=dataset,
from_protein=protein_a, from_protein=protein_a,
to_protein=protein_b to_protein=protein_b
) ))
if created:
count += 1
except models.ValidationError: except models.ValidationError:
# duplicate # duplicate
continue continue
return count models.ProteinProteinInteraction.objects.bulk_create(bulk)
return len(bulk)
def populate_pdi_chembl() -> int: def populate_pdi_chembl() -> int:
""" Populates the Protein-Drug-Interactions from Chembl """ Populates the Protein-Drug-Interactions from Chembl
...@@ -180,8 +173,8 @@ class DataPopulator: ...@@ -180,8 +173,8 @@ class DataPopulator:
link='https://www.ebi.ac.uk/chembl/', link='https://www.ebi.ac.uk/chembl/',
version='27', version='27',
) )
count = 0 bulk = list()
for index, row in df.iterrows(): for _, row in df.iterrows():
try: try:
# try fetching protein # try fetching protein
protein = models.Protein.objects.get(uniprot_code=row['protein_ac']) protein = models.Protein.objects.get(uniprot_code=row['protein_ac'])
...@@ -194,14 +187,13 @@ class DataPopulator: ...@@ -194,14 +187,13 @@ class DataPopulator:
except models.Drug.DoesNotExist: except models.Drug.DoesNotExist:
# continue if not found # continue if not found
continue continue
_, created = models.ProteinDrugInteraction.objects.get_or_create( bulk.append(models.ProteinDrugInteraction(
pdi_dataset=dataset, pdi_dataset=dataset,
protein=protein, protein=protein,
drug=drug drug=drug
) ))
if created: models.ProteinProteinInteraction.objects.bulk_create(bulk)
count += 1 return len(bulk)
return count
def populate_pdis_disgenet() -> int: def populate_pdis_disgenet() -> int:
""" Populates the Protein-Disorder-Interactions from DisGeNET """ Populates the Protein-Disorder-Interactions from DisGeNET
...@@ -216,8 +208,8 @@ class DataPopulator: ...@@ -216,8 +208,8 @@ class DataPopulator:
link='https://www.disgenet.org/home/', link='https://www.disgenet.org/home/',
version='6.0', version='6.0',
) )
count = 0 bulk = list()
for index, row in df.iterrows(): for _, row in df.iterrows():
try: try:
# try fetching protein # try fetching protein
protein = models.Protein.objects.get(uniprot_code=row['protein_name']) protein = models.Protein.objects.get(uniprot_code=row['protein_name'])
...@@ -230,15 +222,14 @@ class DataPopulator: ...@@ -230,15 +222,14 @@ class DataPopulator:
except models.Disorder.DoesNotExist: except models.Disorder.DoesNotExist:
# continue if not found # continue if not found
continue continue
_, created = models.ProteinDisorderAssociation.objects.get_or_create( bulk.append(models.ProteinDisorderAssociation(
pdis_dataset=dataset, pdis_dataset=dataset,
protein=protein, protein=protein,
disorder=disorder, disorder=disorder,
score=row['score'] score=row['score']
) ))
if created: models.ProteinDisorderAssociation.objects.bulk_create(bulk)
count += 1 return len(bulk)
return count
def populate_drdis_drugbank() -> int: def populate_drdis_drugbank() -> int:
""" Populates the Drug-Disorder-Indications from DrugBank """ Populates the Drug-Disorder-Indications from DrugBank
...@@ -253,8 +244,8 @@ class DataPopulator: ...@@ -253,8 +244,8 @@ class DataPopulator:
link='https://go.drugbank.com/', link='https://go.drugbank.com/',
version='5.1.8', version='5.1.8',
) )
count = 0 bulk = list()
for index, row in df.iterrows(): for _, row in df.iterrows():
try: try:
# try fetching protein # try fetching protein
drug = models.Drug.objects.get(drug_id=row['drugbank_id']) drug = models.Drug.objects.get(drug_id=row['drugbank_id'])
...@@ -267,14 +258,13 @@ class DataPopulator: ...@@ -267,14 +258,13 @@ class DataPopulator:
except models.Disorder.DoesNotExist: except models.Disorder.DoesNotExist:
# continue if not found # continue if not found
continue continue
_, created = models.DrugDisorderIndication.objects.get_or_create( bulk.append(models.DrugDisorderIndication(
drdi_dataset=dataset, drdi_dataset=dataset,
drug=drug, drug=drug,
disorder=disorder, disorder=disorder,
) ))
if created: models.DrugDisorderIndication.objects.bulk_create(bulk)
count += 1 return len(bulk)
return count
def populate_pdi_dgidb() -> int: def populate_pdi_dgidb() -> int:
""" Populates the Protein-Drug-Interactions from DGIdb """ Populates the Protein-Drug-Interactions from DGIdb
...@@ -289,7 +279,7 @@ class DataPopulator: ...@@ -289,7 +279,7 @@ class DataPopulator:
link='https://www.dgidb.org/', link='https://www.dgidb.org/',
version='4.2.0' version='4.2.0'
) )
count = 0 bulk = list()
for _, row in df.iterrows(): for _, row in df.iterrows():
try: try:
# try fetching protein # try fetching protein
...@@ -303,14 +293,13 @@ class DataPopulator: ...@@ -303,14 +293,13 @@ class DataPopulator:
except models.Drug.DoesNotExist: except models.Drug.DoesNotExist:
# continue if not found # continue if not found
continue continue
_, created = models.ProteinDrugInteraction.objects.get_or_create( bulk.append(models.ProteinDrugInteraction(
pdi_dataset=dataset, pdi_dataset=dataset,
protein=protein, protein=protein,
drug=drug drug=drug
) ))
if created: models.ProteinDrugInteraction.objects.bulk_create(bulk)
count += 1 return len(bulk)
return count
def populate_pdi_drugbank() -> int: def populate_pdi_drugbank() -> int:
""" Populates the Protein-Drug-Interactions from Drugbank """ Populates the Protein-Drug-Interactions from Drugbank
...@@ -325,7 +314,7 @@ class DataPopulator: ...@@ -325,7 +314,7 @@ class DataPopulator:
link='https://go.drugbank.com/', link='https://go.drugbank.com/',
version='5.1.7' version='5.1.7'
) )
count = 0 bulk = list()
for _, row in df.iterrows(): for _, row in df.iterrows():
try: try:
# try fetching protein # try fetching protein
...@@ -339,11 +328,10 @@ class DataPopulator: ...@@ -339,11 +328,10 @@ class DataPopulator:
except models.Drug.DoesNotExist: except models.Drug.DoesNotExist:
# continue if not found # continue if not found
continue continue
_, created = models.ProteinDrugInteraction.objects.get_or_create( bulk.append(models.ProteinDrugInteraction(
pdi_dataset=dataset, pdi_dataset=dataset,
protein=protein, protein=protein,
drug=drug drug=drug
) ))
if created: models.ProteinDrugInteraction.objects.bulk_create(bulk)
count += 1 return len(bulk)
return count
...@@ -3,6 +3,6 @@ from celery.schedules import crontab ...@@ -3,6 +3,6 @@ from celery.schedules import crontab
CELERY_BEAT_SCHEDULE = { CELERY_BEAT_SCHEDULE = {
'update_db': { 'update_db': {
'task': 'drugstone.tasks.task_update_db_from_nedrex', 'task': 'drugstone.tasks.task_update_db_from_nedrex',
'schedule': crontab(minute='*/1000'), 'schedule': crontab(minute='*/1'),
}, },
} }
...@@ -24,7 +24,7 @@ BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) ...@@ -24,7 +24,7 @@ BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
SECRET_KEY = os.environ.get('SECRET_KEY') SECRET_KEY = os.environ.get('SECRET_KEY')
# SECURITY WARNING: don't run with debug turned on in production! # SECURITY WARNING: don't run with debug turned on in production!
DEBUG = os.environ.get('DEBUG', False) DEBUG = os.environ.get('DEBUG') == '1'
ALLOWED_HOSTS = [ ALLOWED_HOSTS = [
'localhost', 'localhost',
...@@ -45,6 +45,7 @@ INSTALLED_APPS = [ ...@@ -45,6 +45,7 @@ INSTALLED_APPS = [
'django.contrib.staticfiles', 'django.contrib.staticfiles',
'corsheaders', 'corsheaders',
'drugstone', 'drugstone',
# 'python_nedrex',
'rest_framework', 'rest_framework',
] ]
......
...@@ -11,8 +11,8 @@ def task_update_db_from_nedrex(): ...@@ -11,8 +11,8 @@ def task_update_db_from_nedrex():
print('here') print('here')
logger.info('Fetching data...') logger.info('Fetching data...')
fetch_nedrex_data() # fetch_nedrex_data()
logger.info('Integrating data...') logger.info('Integrating data...')
integrate_nedrex_data() # integrate_nedrex_data()
logger.info('Done.') logger.info('Done.')
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment