Skip to content
Snippets Groups Projects
Commit 052fca65 authored by AndiMajore
Browse files

switched to bulk inserts

Former-commit-id: e597062bdda4bb05012c8651bb5a5c56400224cb [formerly 13fab6dfcbe97065258dfaa5092ef5cf8231f901]
Former-commit-id: 77d907ddff45e4e927d2b7a400559e3298510fed
parent 47f867fe
No related branches found
No related tags found
No related merge requests found
......@@ -11,8 +11,8 @@ services:
env_file:
- 'docker-django.env.dev'
restart: always
volumes:
- ./:/usr/src/drugstone/
# volumes:
# - ./:/usr/src/drugstone/
ports:
- 8001:8000
networks:
......@@ -60,8 +60,8 @@ services:
hostname: drugstone_celery
env_file:
- './docker-django.env.dev'
volumes:
- ./:/usr/src/drugstone/
# volumes:
# - ./:/usr/src/drugstone/
depends_on:
- redis
- db
......@@ -76,8 +76,8 @@ services:
hostname: drugstone_celery_beat
env_file:
- './docker-django.env.dev'
volumes:
- ./:/usr/src/drugstone/
# volumes:
# - ./:/usr/src/drugstone/
depends_on:
- redis
- db
......
# from collections import defaultdict
#
#
# def import_proteins():
# import python_nedrex as nedrex
# from python_nedrex.core import get_nodes, get_api_key, get_edges
# from models import Protein
#
# def iter_node_collection(coll_name, eval):
# offset = 0
# limit = 10000
# while True:
# result = get_nodes(coll_name, offset=offset, limit=limit)
# if not result:
# return
# for node in result:
# eval(node)
# offset += limit
#
# def iter_edge_collection(coll_name, eval):
# offset = 0
# limit = 10000
# while True:
# result = get_edges(coll_name, offset=offset, limit=limit)
# if not result:
# return
# for edge in result:
# eval(edge)
# offset += limit
#
# def add_protein(node):
# global proteins
# id = node['primaryDomainId']
# proteins[id] = Protein(uniprot_code=id.split('.')[1], gene=node['geneName'])
#
# def add_edges(edge):
# global proteins
# id = edge['sourceDomainId']
# protein = proteins[id]
# protein.entrez = edge['targetDomainId'].split('.')[1]
# global gene_to_prots
# gene_to_prots[edge['targetDomainId']].add(id)
#
# def add_genes(node):
# global proteins
# global gene_to_prots
# id = node['primaryDomainId']
# for prot_id in gene_to_prots[id]:
# protein = proteins[prot_id]
# try:
# protein.protein_name = node['synonyms'][0]
# except:
# pass
#
# nedrex.config.set_url_base("http://82.148.225.92:8123/")
# api_key = get_api_key(accept_eula=True)
# nedrex.config.set_api_key(api_key)
#
# proteins = dict()
# gene_to_prots = defaultdict(lambda: set())
#
# print('Importing Proteins')
# iter_node_collection('protein', add_protein)
# print('Importing Protein-Gene mapping')
# iter_edge_collection('protein_encoded_by_gene', add_edges)
# print('Mapping Gene information')
# iter_node_collection('gene', add_genes)
# Protein.objects.bulk_create(proteins.values())
......@@ -74,47 +74,6 @@ class DatabasePopulator:
print('Done!\n')
# def populate_protein_model(self):
# print('Populating Protein model ...')
# protein_df = pd.read_csv(f'{self.data_dir}/data_drugstone/{self.protein_file}', delimiter='\t')
# for _, row in protein_df.iterrows():
# protein_ac = row['protein_ac']
# gene_name = row['gene_name']
# protein_name = row['protein_name']
# if gene_name == 'None':
# gene_name = ''
# protein_object = Protein(uniprot_code=protein_ac, gene=gene_name, protein_name=protein_name)
# protein_object.save()
# print('Done!\n')
# def populate_pdi_model(self):
# print('Populating ProteinDrugInteraction model ...')
# pdi_df = pd.read_csv(f'{self.data_dir}/data_drugstone/{self.pdi_file}', delimiter='\t')
# for _, row in pdi_df.iterrows():
# protein_ac = row['protein_ac']
# drug_id = row['drug_id']
# try:
# protein_object = Protein.objects.get(uniprot_code=protein_ac)
# except Protein.DoesNotExist:
# print(f'Protein AC {protein_ac} not found in Protein model!')
# continue
# try:
# drug_object = Drug.objects.get(drug_id=drug_id)
# except Drug.DoesNotExist:
# print(f'Drug ID {drug_id} not found in Drug model!')
# continue
# # insert protein-drug to PDI model
# pdi_object = ProteinDrugInteraction(protein=protein_object, drug=drug_object)
# pdi_object.save()
# print('Done!\n')
def populate_exp_model(self):
print('Populating Tissue and ExpressionLevel model ...')
......@@ -146,32 +105,6 @@ class DatabasePopulator:
print(f'Added {proteins_linked} expression levels!\n')
# def populate_ppi_model(self):
# print('Populating ProteinProteinInteraction model ...')
# pdi_df = pd.read_csv(f'{self.data_dir}/data_drugstone/{self.ppi_file}', delimiter='\t')
# for _, row in pdi_df.iterrows():
# from_protein_ac = row['from_protein_ac']
# to_protein_ac = row['to_protein_ac']
# try:
# from_protein_object = Protein.objects.get(uniprot_code=from_protein_ac)
# except Protein.DoesNotExist:
# print(f'Protein AC {from_protein_ac} not found in Protein model!')
# continue
# try:
# to_protein_object = Protein.objects.get(uniprot_code=to_protein_ac)
# except Protein.DoesNotExist:
# print(f'Protein AC {to_protein_ac} not found in Protein model!')
# continue
# # insert protein-protein edge to ProteinProteinInteraction model
# ppi_object = ProteinProteinInteraction(from_protein=from_protein_object, to_protein=to_protein_object)
# ppi_object.save()
# print('Done!\n')
class Command(BaseCommand):
......
......@@ -12,17 +12,16 @@ class DataPopulator:
int: Count of how many proteins were added
"""
df = DataLoader.load_proteins()
count = 0
proteins = list()
for _, row in df.iterrows():
_, created = models.Protein.objects.update_or_create(
proteins.append(models.Protein(
uniprot_code=row['protein_ac'],
gene=row['gene_name'],
entrez=row['entrez_id'],
defaults={'protein_name': row['protein_name']}
protein_name=row['protein_name'])
)
if created:
count += 1
return count
models.Protein.objects.bulk_create(proteins)
return len(proteins)
def populate_disorders() -> int:
""" Populates the Disorder table in the django database.
......@@ -32,17 +31,15 @@ class DataPopulator:
int: Count of how many disorders were added
"""
df = DataLoader.load_disorders()
count = 0
bulk = list()
for _, row in df.iterrows():
_, created = models.Disorder.objects.update_or_create(
bulk.append(models.Disorder(
mondo_id=row['mondo_id'],
label=row['label'],
icd10=row['icd10'],
defaults={'label': row['label']}
)
if created:
count += 1
return count
icd10=row['icd10']
))
models.Disorder.objects.bulk_create(bulk)
return len(bulk)
def populate_ensg() -> int:
""" Populates the Ensembl-Gene table in the django database.
......@@ -53,14 +50,13 @@ class DataPopulator:
int: Count of how many ensg-protein relations were added
"""
data = DataLoader.load_ensg()
count = 0
bulk = list()
for entrez, ensg_list in data.items():
protein = models.Protein.objects.get(entrez=entrez)
for ensg in ensg_list:
_, created = models.EnsemblGene.objects.get_or_create(name=ensg, protein=protein)
if created:
count += 1
return count
bulk.append(models.EnsemblGene(name=ensg, protein=protein))
models.EnsemblGene.objects.bulk_create(bulk)
return len(bulk)
def populate_ppi_string() -> int:
""" Populates the Protein-Protein-Interactions from STRINGdb
......@@ -74,8 +70,8 @@ class DataPopulator:
name='STRING',
link='https://string-db.org/',
version='11.0'
)
count = 0
)
bulk = list()
for _, row in df.iterrows():
try:
# try fetching proteins
......@@ -85,17 +81,16 @@ class DataPopulator:
# continue if not found
continue
try:
_, created = models.ProteinProteinInteraction.objects.get_or_create(
bulk.append(models.ProteinProteinInteraction(
ppi_dataset=dataset,
from_protein=protein_a,
to_protein=protein_b
)
if created:
count += 1
))
except models.ValidationError:
# duplicate
continue
return count
models.ProteinProteinInteraction.objects.bulk_create(bulk)
return len(bulk)
def populate_ppi_apid() -> int:
""" Populates the Protein-Protein-Interactions from Apid
......@@ -109,8 +104,8 @@ class DataPopulator:
name='APID',
link='http://cicblade.dep.usal.es:8080/APID/',
version='January 2019'
)
count = 0
)
bulk = list()
for _, row in df.iterrows():
try:
# try fetching proteins
......@@ -120,17 +115,16 @@ class DataPopulator:
# continue if not found
continue
try:
_, created = models.ProteinProteinInteraction.objects.get_or_create(
bulk.append(models.ProteinProteinInteraction(
ppi_dataset=dataset,
from_protein=protein_a,
to_protein=protein_b
)
if created:
count += 1
))
except models.ValidationError:
# duplicate
continue
return count
models.ProteinProteinInteraction.objects.bulk_create(bulk)
return len(bulk)
def populate_ppi_biogrid() -> int:
""" Populates the Protein-Protein-Interactions from BioGRID
......@@ -144,8 +138,8 @@ class DataPopulator:
name='BioGRID',
link='https://thebiogrid.org/',
version='4.0'
)
count = 0
)
bulk = list()
for _, row in df.iterrows():
try:
# try fetching proteins
......@@ -155,17 +149,16 @@ class DataPopulator:
# continue if not found
continue
try:
_, created = models.ProteinProteinInteraction.objects.get_or_create(
bulk.append(models.ProteinProteinInteraction(
ppi_dataset=dataset,
from_protein=protein_a,
to_protein=protein_b
)
if created:
count += 1
))
except models.ValidationError:
# duplicate
continue
return count
models.ProteinProteinInteraction.objects.bulk_create(bulk)
return len(bulk)
def populate_pdi_chembl() -> int:
""" Populates the Protein-Drug-Interactions from Chembl
......@@ -179,9 +172,9 @@ class DataPopulator:
name='ChEMBL',
link='https://www.ebi.ac.uk/chembl/',
version='27',
)
count = 0
for index, row in df.iterrows():
)
bulk = list()
for _, row in df.iterrows():
try:
# try fetching protein
protein = models.Protein.objects.get(uniprot_code=row['protein_ac'])
......@@ -194,14 +187,13 @@ class DataPopulator:
except models.Drug.DoesNotExist:
# continue if not found
continue
_, created = models.ProteinDrugInteraction.objects.get_or_create(
bulk.append(models.ProteinDrugInteraction(
pdi_dataset=dataset,
protein=protein,
drug=drug
)
if created:
count += 1
return count
))
# `bulk` holds ProteinDrugInteraction instances, so they must be inserted through
# that model's manager (as populate_pdi_dgidb and populate_pdi_drugbank do),
# not through the ProteinProteinInteraction manager.
models.ProteinDrugInteraction.objects.bulk_create(bulk)
return len(bulk)
def populate_pdis_disgenet() -> int:
""" Populates the Protein-Disorder-Interactions from DisGeNET
......@@ -215,9 +207,9 @@ class DataPopulator:
name='DisGeNET',
link='https://www.disgenet.org/home/',
version='6.0',
)
count = 0
for index, row in df.iterrows():
)
bulk = list()
for _, row in df.iterrows():
try:
# try fetching protein
protein = models.Protein.objects.get(uniprot_code=row['protein_name'])
......@@ -230,15 +222,14 @@ class DataPopulator:
except models.Disorder.DoesNotExist:
# continue if not found
continue
_, created = models.ProteinDisorderAssociation.objects.get_or_create(
bulk.append(models.ProteinDisorderAssociation(
pdis_dataset=dataset,
protein=protein,
disorder=disorder,
score=row['score']
)
if created:
count += 1
return count
))
models.ProteinDisorderAssociation.objects.bulk_create(bulk)
return len(bulk)
def populate_drdis_drugbank() -> int:
""" Populates the Drug-Disorder-Indications from DrugBank
......@@ -253,8 +244,8 @@ class DataPopulator:
link='https://go.drugbank.com/',
version='5.1.8',
)
count = 0
for index, row in df.iterrows():
bulk = list()
for _, row in df.iterrows():
try:
# try fetching protein
drug = models.Drug.objects.get(drug_id=row['drugbank_id'])
......@@ -267,14 +258,13 @@ class DataPopulator:
except models.Disorder.DoesNotExist:
# continue if not found
continue
_, created = models.DrugDisorderIndication.objects.get_or_create(
bulk.append(models.DrugDisorderIndication(
drdi_dataset=dataset,
drug=drug,
disorder=disorder,
)
if created:
count += 1
return count
))
models.DrugDisorderIndication.objects.bulk_create(bulk)
return len(bulk)
def populate_pdi_dgidb() -> int:
""" Populates the Protein-Drug-Interactions from DGIdb
......@@ -288,8 +278,8 @@ class DataPopulator:
name='DGIdb',
link='https://www.dgidb.org/',
version='4.2.0'
)
count = 0
)
bulk = list()
for _, row in df.iterrows():
try:
# try fetching protein
......@@ -303,14 +293,13 @@ class DataPopulator:
except models.Drug.DoesNotExist:
# continue if not found
continue
_, created = models.ProteinDrugInteraction.objects.get_or_create(
bulk.append(models.ProteinDrugInteraction(
pdi_dataset=dataset,
protein=protein,
drug=drug
)
if created:
count += 1
return count
))
models.ProteinDrugInteraction.objects.bulk_create(bulk)
return len(bulk)
def populate_pdi_drugbank() -> int:
""" Populates the Protein-Drug-Interactions from Drugbank
......@@ -324,8 +313,8 @@ class DataPopulator:
name='DrugBank',
link='https://go.drugbank.com/',
version='5.1.7'
)
count = 0
)
bulk = list()
for _, row in df.iterrows():
try:
# try fetching protein
......@@ -339,11 +328,10 @@ class DataPopulator:
except models.Drug.DoesNotExist:
# continue if not found
continue
_, created = models.ProteinDrugInteraction.objects.get_or_create(
bulk.append(models.ProteinDrugInteraction(
pdi_dataset=dataset,
protein=protein,
drug=drug
)
if created:
count += 1
return count
))
models.ProteinDrugInteraction.objects.bulk_create(bulk)
return len(bulk)
......@@ -3,6 +3,6 @@ from celery.schedules import crontab
CELERY_BEAT_SCHEDULE = {
'update_db': {
'task': 'drugstone.tasks.task_update_db_from_nedrex',
'schedule': crontab(minute='*/1000'),
'schedule': crontab(minute='*/1'),
},
}
......@@ -24,7 +24,7 @@ BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
SECRET_KEY = os.environ.get('SECRET_KEY')
# SECURITY WARNING: don't run with debug turned on in production!
DEBUG = os.environ.get('DEBUG', False)
DEBUG = os.environ.get('DEBUG') == '1'
ALLOWED_HOSTS = [
'localhost',
......@@ -45,6 +45,7 @@ INSTALLED_APPS = [
'django.contrib.staticfiles',
'corsheaders',
'drugstone',
# 'python_nedrex',
'rest_framework',
]
......
......@@ -11,8 +11,8 @@ def task_update_db_from_nedrex():
print('here')
logger.info('Fetching data...')
fetch_nedrex_data()
# fetch_nedrex_data()
logger.info('Integrating data...')
integrate_nedrex_data()
# integrate_nedrex_data()
logger.info('Done.')
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment