Skip to content
Snippets Groups Projects
Commit 8fe8745b authored by AndiMajore
Browse files

switched to bulk inserts

parent 267b8afd
No related branches found
No related tags found
No related merge requests found
Pipeline #11952 failed
......@@ -11,8 +11,8 @@ services:
env_file:
- 'docker-django.env.dev'
restart: always
volumes:
- ./:/usr/src/drugstone/
# volumes:
# - ./:/usr/src/drugstone/
ports:
- 8001:8000
networks:
......@@ -60,8 +60,8 @@ services:
hostname: drugstone_celery
env_file:
- './docker-django.env.dev'
volumes:
- ./:/usr/src/drugstone/
# volumes:
# - ./:/usr/src/drugstone/
depends_on:
- redis
- db
......@@ -76,8 +76,8 @@ services:
hostname: drugstone_celery_beat
env_file:
- './docker-django.env.dev'
volumes:
- ./:/usr/src/drugstone/
# volumes:
# - ./:/usr/src/drugstone/
depends_on:
- redis
- db
......
# from collections import defaultdict
#
#
# def import_proteins():
# import python_nedrex as nedrex
# from python_nedrex.core import get_nodes, get_api_key, get_edges
# from models import Protein
#
# def iter_node_collection(coll_name, eval):
# offset = 0
# limit = 10000
# while True:
# result = get_nodes(coll_name, offset=offset, limit=limit)
# if not result:
# return
# for node in result:
# eval(node)
# offset += limit
#
# def iter_edge_collection(coll_name, eval):
# offset = 0
# limit = 10000
# while True:
# result = get_edges(coll_name, offset=offset, limit=limit)
# if not result:
# return
# for edge in result:
# eval(edge)
# offset += limit
#
# def add_protein(node):
# global proteins
# id = node['primaryDomainId']
# proteins[id] = Protein(uniprot_code=id.split('.')[1], gene=node['geneName'])
#
# def add_edges(edge):
# global proteins
# id = edge['sourceDomainId']
# protein = proteins[id]
# protein.entrez = edge['targetDomainId'].split('.')[1]
# global gene_to_prots
# gene_to_prots[edge['targetDomainId']].add(id)
#
# def add_genes(node):
# global proteins
# global gene_to_prots
# id = node['primaryDomainId']
# for prot_id in gene_to_prots[id]:
# protein = proteins[prot_id]
# try:
# protein.protein_name = node['synonyms'][0]
# except:
# pass
#
# nedrex.config.set_url_base("http://82.148.225.92:8123/")
# api_key = get_api_key(accept_eula=True)
# nedrex.config.set_api_key(api_key)
#
# proteins = dict()
# gene_to_prots = defaultdict(lambda: set())
#
# print('Importing Proteins')
# iter_node_collection('protein', add_protein)
# print('Importing Protein-Gene mapping')
# iter_edge_collection('protein_encoded_by_gene', add_edges)
# print('Mapping Gene information')
# iter_node_collection('gene', add_genes)
# Protein.objects.bulk_create(proteins.values())
......@@ -74,47 +74,6 @@ class DatabasePopulator:
print('Done!\n')
# def populate_protein_model(self):
# print('Populating Protein model ...')
# protein_df = pd.read_csv(f'{self.data_dir}/data_drugstone/{self.protein_file}', delimiter='\t')
# for _, row in protein_df.iterrows():
# protein_ac = row['protein_ac']
# gene_name = row['gene_name']
# protein_name = row['protein_name']
# if gene_name == 'None':
# gene_name = ''
# protein_object = Protein(uniprot_code=protein_ac, gene=gene_name, protein_name=protein_name)
# protein_object.save()
# print('Done!\n')
# def populate_pdi_model(self):
# print('Populating ProteinDrugInteraction model ...')
# pdi_df = pd.read_csv(f'{self.data_dir}/data_drugstone/{self.pdi_file}', delimiter='\t')
# for _, row in pdi_df.iterrows():
# protein_ac = row['protein_ac']
# drug_id = row['drug_id']
# try:
# protein_object = Protein.objects.get(uniprot_code=protein_ac)
# except Protein.DoesNotExist:
# print(f'Protein AC {protein_ac} not found in Protein model!')
# continue
# try:
# drug_object = Drug.objects.get(drug_id=drug_id)
# except Drug.DoesNotExist:
# print(f'Drug ID {drug_id} not found in Drug model!')
# continue
# # insert protein-drug to PDI model
# pdi_object = ProteinDrugInteraction(protein=protein_object, drug=drug_object)
# pdi_object.save()
# print('Done!\n')
def populate_exp_model(self):
print('Populating Tissue and ExpressionLevel model ...')
......@@ -146,32 +105,6 @@ class DatabasePopulator:
print(f'Added {proteins_linked} expression levels!\n')
# def populate_ppi_model(self):
# print('Populating ProteinProteinInteraction model ...')
# pdi_df = pd.read_csv(f'{self.data_dir}/data_drugstone/{self.ppi_file}', delimiter='\t')
# for _, row in pdi_df.iterrows():
# from_protein_ac = row['from_protein_ac']
# to_protein_ac = row['to_protein_ac']
# try:
# from_protein_object = Protein.objects.get(uniprot_code=from_protein_ac)
# except Protein.DoesNotExist:
# print(f'Protein AC {from_protein_ac} not found in Protein model!')
# continue
# try:
# to_protein_object = Protein.objects.get(uniprot_code=to_protein_ac)
# except Protein.DoesNotExist:
# print(f'Protein AC {to_protein_ac} not found in Protein model!')
# continue
# # insert protein-protein edge to ProteinProteinInteraction model
# ppi_object = ProteinProteinInteraction(from_protein=from_protein_object, to_protein=to_protein_object)
# ppi_object.save()
# print('Done!\n')
class Command(BaseCommand):
......
......@@ -12,17 +12,16 @@ class DataPopulator:
int: Count of how many proteins were added
"""
df = DataLoader.load_proteins()
count = 0
proteins = list()
for _, row in df.iterrows():
_, created = models.Protein.objects.update_or_create(
proteins.append(models.Protein(
uniprot_code=row['protein_ac'],
gene=row['gene_name'],
entrez=row['entrez_id'],
defaults={'protein_name': row['protein_name']}
protein_name=row['protein_name'])
)
if created:
count += 1
return count
models.Protein.objects.bulk_create(proteins)
return len(proteins)
def populate_disorders() -> int:
""" Populates the Disorder table in the django database.
......@@ -32,17 +31,15 @@ class DataPopulator:
int: Count of how many disorders were added
"""
df = DataLoader.load_disorders()
count = 0
bulk = list()
for _, row in df.iterrows():
_, created = models.Disorder.objects.update_or_create(
bulk.append(models.Disorder(
mondo_id=row['mondo_id'],
label=row['label'],
icd10=row['icd10'],
defaults={'label': row['label']}
)
if created:
count += 1
return count
icd10=row['icd10']
))
models.Disorder.objects.bulk_create(bulk)
return len(bulk)
def populate_ensg() -> int:
""" Populates the Ensembl-Gene table in the django database.
......@@ -53,14 +50,13 @@ class DataPopulator:
int: Count of how many ensg-protein relations were added
"""
data = DataLoader.load_ensg()
count = 0
bulk = list()
for entrez, ensg_list in data.items():
protein = models.Protein.objects.get(entrez=entrez)
for ensg in ensg_list:
_, created = models.EnsemblGene.objects.get_or_create(name=ensg, protein=protein)
if created:
count += 1
return count
bulk.append(models.EnsemblGene(name=ensg, protein=protein))
models.EnsemblGene.objects.bulk_create(bulk)
return len(bulk)
def populate_ppi_string() -> int:
""" Populates the Protein-Protein-Interactions from STRINGdb
......@@ -75,7 +71,7 @@ class DataPopulator:
link='https://string-db.org/',
version='11.0'
)
count = 0
bulk = list()
for _, row in df.iterrows():
try:
# try fetching proteins
......@@ -85,17 +81,16 @@ class DataPopulator:
# continue if not found
continue
try:
_, created = models.ProteinProteinInteraction.objects.get_or_create(
bulk.append(models.ProteinProteinInteraction(
ppi_dataset=dataset,
from_protein=protein_a,
to_protein=protein_b
)
if created:
count += 1
))
except models.ValidationError:
# duplicate
continue
return count
models.ProteinProteinInteraction.objects.bulk_create(bulk)
return len(bulk)
def populate_ppi_apid() -> int:
""" Populates the Protein-Protein-Interactions from Apid
......@@ -110,7 +105,7 @@ class DataPopulator:
link='http://cicblade.dep.usal.es:8080/APID/',
version='January 2019'
)
count = 0
bulk = list()
for _, row in df.iterrows():
try:
# try fetching proteins
......@@ -120,17 +115,16 @@ class DataPopulator:
# continue if not found
continue
try:
_, created = models.ProteinProteinInteraction.objects.get_or_create(
bulk.append(models.ProteinProteinInteraction(
ppi_dataset=dataset,
from_protein=protein_a,
to_protein=protein_b
)
if created:
count += 1
))
except models.ValidationError:
# duplicate
continue
return count
models.ProteinProteinInteraction.objects.bulk_create(bulk)
return len(bulk)
def populate_ppi_biogrid() -> int:
""" Populates the Protein-Protein-Interactions from BioGRID
......@@ -145,7 +139,7 @@ class DataPopulator:
link='https://thebiogrid.org/',
version='4.0'
)
count = 0
bulk = list()
for _, row in df.iterrows():
try:
# try fetching proteins
......@@ -155,17 +149,16 @@ class DataPopulator:
# continue if not found
continue
try:
_, created = models.ProteinProteinInteraction.objects.get_or_create(
bulk.append(models.ProteinProteinInteraction(
ppi_dataset=dataset,
from_protein=protein_a,
to_protein=protein_b
)
if created:
count += 1
))
except models.ValidationError:
# duplicate
continue
return count
models.ProteinProteinInteraction.objects.bulk_create(bulk)
return len(bulk)
def populate_pdi_chembl() -> int:
""" Populates the Protein-Drug-Interactions from Chembl
......@@ -180,8 +173,8 @@ class DataPopulator:
link='https://www.ebi.ac.uk/chembl/',
version='27',
)
count = 0
for index, row in df.iterrows():
bulk = list()
for _, row in df.iterrows():
try:
# try fetching protein
protein = models.Protein.objects.get(uniprot_code=row['protein_ac'])
......@@ -194,14 +187,13 @@ class DataPopulator:
except models.Drug.DoesNotExist:
# continue if not found
continue
_, created = models.ProteinDrugInteraction.objects.get_or_create(
bulk.append(models.ProteinDrugInteraction(
pdi_dataset=dataset,
protein=protein,
drug=drug
)
if created:
count += 1
return count
))
models.ProteinProteinInteraction.objects.bulk_create(bulk)
return len(bulk)
def populate_pdis_disgenet() -> int:
""" Populates the Protein-Disorder-Interactions from DisGeNET
......@@ -216,8 +208,8 @@ class DataPopulator:
link='https://www.disgenet.org/home/',
version='6.0',
)
count = 0
for index, row in df.iterrows():
bulk = list()
for _, row in df.iterrows():
try:
# try fetching protein
protein = models.Protein.objects.get(uniprot_code=row['protein_name'])
......@@ -230,15 +222,14 @@ class DataPopulator:
except models.Disorder.DoesNotExist:
# continue if not found
continue
_, created = models.ProteinDisorderAssociation.objects.get_or_create(
bulk.append(models.ProteinDisorderAssociation(
pdis_dataset=dataset,
protein=protein,
disorder=disorder,
score=row['score']
)
if created:
count += 1
return count
))
models.ProteinDisorderAssociation.objects.bulk_create(bulk)
return len(bulk)
def populate_drdis_drugbank() -> int:
""" Populates the Drug-Disorder-Indications from DrugBank
......@@ -253,8 +244,8 @@ class DataPopulator:
link='https://go.drugbank.com/',
version='5.1.8',
)
count = 0
for index, row in df.iterrows():
bulk = list()
for _, row in df.iterrows():
try:
# try fetching protein
drug = models.Drug.objects.get(drug_id=row['drugbank_id'])
......@@ -267,14 +258,13 @@ class DataPopulator:
except models.Disorder.DoesNotExist:
# continue if not found
continue
_, created = models.DrugDisorderIndication.objects.get_or_create(
bulk.append(models.DrugDisorderIndication(
drdi_dataset=dataset,
drug=drug,
disorder=disorder,
)
if created:
count += 1
return count
))
models.DrugDisorderIndication.objects.bulk_create(bulk)
return len(bulk)
def populate_pdi_dgidb() -> int:
""" Populates the Protein-Drug-Interactions from DGIdb
......@@ -289,7 +279,7 @@ class DataPopulator:
link='https://www.dgidb.org/',
version='4.2.0'
)
count = 0
bulk = list()
for _, row in df.iterrows():
try:
# try fetching protein
......@@ -303,14 +293,13 @@ class DataPopulator:
except models.Drug.DoesNotExist:
# continue if not found
continue
_, created = models.ProteinDrugInteraction.objects.get_or_create(
bulk.append(models.ProteinDrugInteraction(
pdi_dataset=dataset,
protein=protein,
drug=drug
)
if created:
count += 1
return count
))
models.ProteinDrugInteraction.objects.bulk_create(bulk)
return len(bulk)
def populate_pdi_drugbank() -> int:
""" Populates the Protein-Drug-Interactions from Drugbank
......@@ -325,7 +314,7 @@ class DataPopulator:
link='https://go.drugbank.com/',
version='5.1.7'
)
count = 0
bulk = list()
for _, row in df.iterrows():
try:
# try fetching protein
......@@ -339,11 +328,10 @@ class DataPopulator:
except models.Drug.DoesNotExist:
# continue if not found
continue
_, created = models.ProteinDrugInteraction.objects.get_or_create(
bulk.append(models.ProteinDrugInteraction(
pdi_dataset=dataset,
protein=protein,
drug=drug
)
if created:
count += 1
return count
))
models.ProteinDrugInteraction.objects.bulk_create(bulk)
return len(bulk)
......@@ -3,6 +3,6 @@ from celery.schedules import crontab
CELERY_BEAT_SCHEDULE = {
'update_db': {
'task': 'drugstone.tasks.task_update_db_from_nedrex',
'schedule': crontab(minute='*/1000'),
'schedule': crontab(minute='*/1'),
},
}
......@@ -24,7 +24,7 @@ BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
SECRET_KEY = os.environ.get('SECRET_KEY')
# SECURITY WARNING: don't run with debug turned on in production!
DEBUG = os.environ.get('DEBUG', False)
DEBUG = os.environ.get('DEBUG') == '1'
ALLOWED_HOSTS = [
'localhost',
......@@ -45,6 +45,7 @@ INSTALLED_APPS = [
'django.contrib.staticfiles',
'corsheaders',
'drugstone',
# 'python_nedrex',
'rest_framework',
]
......
......@@ -11,8 +11,8 @@ def task_update_db_from_nedrex():
print('here')
logger.info('Fetching data...')
fetch_nedrex_data()
# fetch_nedrex_data()
logger.info('Integrating data...')
integrate_nedrex_data()
# integrate_nedrex_data()
logger.info('Done.')
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment