Select Git revision
DataPopulator.py
Code owners
Assign users and groups as approvers for specific file changes. Learn more.
DataPopulator.py 12.42 KiB
from drugstone.management.includes.DataLoader import DataLoader
import drugstone.models as models
from drugstone.management.includes.NodeCache import NodeCache
class DataPopulator:
    """Fills the drugstone Django tables from the tables read by ``DataLoader``.

    Every ``populate_*`` method resolves the identifiers of a loaded table to
    model instances through the shared ``NodeCache``, builds the matching
    Django model objects and writes them with ``bulk_create``.

    All methods take an ``update`` flag: if it is truthy, a record is only
    created when at least one of the entities it links is reported as new by
    the cache; if it is falsy, every resolvable record is created.
    """

    # populate_expressions flushes its pending rows to the database as soon
    # as more than this many ExpressionLevel objects are queued.
    EXPRESSION_BATCH_SIZE = 100000

    def __init__(self, cache: NodeCache):
        """Stores the node cache that is used for all lookups.

        Args:
            cache: Cache used to resolve proteins, drugs and disorders.
        """
        self.cache = cache

    def populate_expressions(self, update) -> int:
        """ Populates the per-tissue expression levels of proteins.

        One Tissue is fetched or created for every column of the loaded
        table from the third column onwards. For every row, each protein the
        cache returns for the row's 'Description' value gets one
        ExpressionLevel per tissue.

        Args:
            update: If truthy, only proteins reported as new by the cache
                are processed.

        Returns:
            int: Count of how many expression levels were handed to
            bulk_create
        """
        self.cache.init_proteins()
        df = DataLoader.load_expressions()

        tissue_models = {}
        for tissue_name in df.columns.values[2:]:
            tissue, _ = models.Tissue.objects.get_or_create(name=tissue_name)
            tissue_models[tissue_name] = tissue

        pending = set()
        # Hashes of everything queued so far. Unlike ``pending`` this is not
        # reset on a flush, so duplicates are skipped across batches too.
        seen_hashes = set()
        flushed = 0

        for _, row in df.iterrows():
            for protein in self.cache.get_proteins_by_gene(row['Description']):
                if update and not self.cache.is_new_protein(protein):
                    continue
                for tissue_name, tissue in tissue_models.items():
                    expr = models.ExpressionLevel(
                        protein=protein,
                        tissue=tissue,
                        expression_level=row[tissue_name],
                    )
                    expr_hash = hash(expr)
                    if expr_hash in seen_hashes:
                        continue
                    seen_hashes.add(expr_hash)
                    pending.add(expr)
            # Write in batches so the pending set does not grow unbounded.
            if len(pending) > self.EXPRESSION_BATCH_SIZE:
                models.ExpressionLevel.objects.bulk_create(pending)
                flushed += len(pending)
                pending = set()

        # Write whatever is left after the last full batch.
        models.ExpressionLevel.objects.bulk_create(pending)
        return flushed + len(pending)

    def populate_ensg(self, update) -> int:
        """ Populates the Ensembl-Gene table in the django database.
        Also maps the added ensg entries to the corresponding proteins.
        Handles loading the data and passing it to the django database

        Args:
            update: If truthy, only proteins reported as new by the cache
                get ensg entries.

        Returns:
            int: Count of how many ensg-protein relations were added
        """
        self.cache.init_proteins()
        data = DataLoader.load_ensg()

        bulk = []
        for entrez, ensg_list in data.items():
            for protein in self.cache.get_proteins_by_entrez(entrez):
                # The filter only depends on the protein, so it is checked
                # once per protein instead of once per ensg id.
                if update and not self.cache.is_new_protein(protein):
                    continue
                bulk.extend(
                    models.EnsemblGene(name=ensg, protein=protein)
                    for ensg in ensg_list
                )

        models.EnsemblGene.objects.bulk_create(bulk)
        return len(bulk)

    def populate_ppi_string(self, dataset, update) -> int:
        """ Populates the Protein-Protein-Interactions from STRINGdb
        Handles loading the data and passing it to the django database

        Args:
            dataset: Value stored as ``ppi_dataset`` on every interaction.
            update: If truthy, only interactions with at least one protein
                reported as new by the cache are added.

        Returns:
            int: Count of how many interactions were added
        """
        self.cache.init_proteins()
        df = DataLoader.load_ppi_string()

        bulk = []
        for _, row in df.iterrows():
            try:
                # try fetching proteins
                proteins_a = self.cache.get_proteins_by_entrez(row['entrez_a'])
                proteins_b = self.cache.get_proteins_by_entrez(row['entrez_b'])
            except KeyError:
                # continue if not found
                continue
            for protein_a in proteins_a:
                for protein_b in proteins_b:
                    if update and not (self.cache.is_new_protein(protein_a)
                                       or self.cache.is_new_protein(protein_b)):
                        continue
                    bulk.append(models.ProteinProteinInteraction(
                        ppi_dataset=dataset,
                        from_protein=protein_a,
                        to_protein=protein_b,
                    ))

        models.ProteinProteinInteraction.objects.bulk_create(bulk)
        return len(bulk)

    def populate_ppi_apid(self, dataset, update) -> int:
        """ Populates the Protein-Protein-Interactions from Apid
        Handles loading the data and passing it to the django database

        Args:
            dataset: Value stored as ``ppi_dataset`` on every interaction.
            update: If truthy, only interactions with at least one protein
                reported as new by the cache are added.

        Returns:
            int: Count of how many interactions were added
        """
        self.cache.init_proteins()
        df = DataLoader.load_ppi_apid()

        bulk = set()
        for _, row in df.iterrows():
            try:
                # try fetching proteins
                protein_a = self.cache.get_protein_by_uniprot(row['from_protein_ac'])
                protein_b = self.cache.get_protein_by_uniprot(row['to_protein_ac'])
            except KeyError:
                # continue if not found
                continue
            if update and not (self.cache.is_new_protein(protein_a)
                               or self.cache.is_new_protein(protein_b)):
                continue
            bulk.add(models.ProteinProteinInteraction(
                ppi_dataset=dataset,
                from_protein=protein_a,
                to_protein=protein_b,
            ))

        models.ProteinProteinInteraction.objects.bulk_create(bulk)
        return len(bulk)

    # NOTE: disabled (commented-out) BioGRID loader.
    # def populate_ppi_biogrid(self,dataset, update) -> int:
    #     """ Populates the Protein-Protein-Interactions from BioGRID
    #     Handles loading the data and passing it to the django database
    #
    #     Returns:
    #         int: Count of how many interactions were added
    #     """
    #     self.cache.init_proteins()
    #
    #     df = DataLoader.load_ppi_biogrid()
    #     bulk = list()
    #     for _, row in df.iterrows():
    #         try:
    #             # try fetching proteins
    #             proteins_a = self.cache.get_proteins_by_entrez(row['entrez_a'])
    #             proteins_b = self.cache.get_proteins_by_entrez(row['entrez_b'])
    #         except KeyError:
    #             # TODO update error
    #             # continue if not found
    #             continue
    #         for protein_a in proteins_a:
    #             for protein_b in proteins_b:
    #                 if not update or (self.cache.is_new_protein(protein_a) or self.cache.is_new_protein(protein_b)):
    #                     bulk.append(models.ProteinProteinInteraction(
    #                         ppi_dataset=dataset,
    #                         from_protein=protein_a,
    #                         to_protein=protein_b
    #                     ))
    #     models.ProteinProteinInteraction.objects.bulk_create(bulk)
    #     return len(bulk)

    def populate_pdi_chembl(self, dataset, update) -> int:
        """ Populates the Protein-Drug-Interactions from Chembl
        Handles Loading the data and passing it to the django database

        Args:
            dataset: Value stored as ``pdi_dataset`` on every interaction.
            update: If truthy, only interactions whose protein or drug is
                reported as new by the cache are added.

        Returns:
            int: Count of how many interactions were added
        """
        self.cache.init_proteins()
        self.cache.init_drugs()
        df = DataLoader.load_pdi_chembl()

        bulk = set()
        for _, row in df.iterrows():
            try:
                # try fetching protein
                protein = self.cache.get_protein_by_uniprot(row['protein_ac'])
            except KeyError:
                # continue if not found
                continue
            try:
                # try fetching drug
                drug = self.cache.get_drug_by_drugbank(row['drug_id'])
            except KeyError:
                # continue if not found
                continue
            if update and not (self.cache.is_new_protein(protein)
                               or self.cache.is_new_drug(drug)):
                continue
            bulk.add(models.ProteinDrugInteraction(
                pdi_dataset=dataset,
                protein=protein,
                drug=drug,
            ))

        models.ProteinDrugInteraction.objects.bulk_create(bulk)
        return len(bulk)

    # NOTE: disabled (commented-out) DisGeNET loader.
    # def populate_pdis_disgenet(self, dataset, update) -> int:
    #     """ Populates the Protein-Disorder-Interactions from DisGeNET
    #     Handles Loading the data and passing it to the django database
    #
    #     Returns:
    #         int: Count of how many interactions were added
    #     """
    #     self.cache.init_proteins()
    #     self.cache.init_disorders()
    #
    #     df = DataLoader.load_pdis_disgenet()
    #     bulk = set()
    #     for _, row in df.iterrows():
    #         try:
    #             # try fetching protein
    #             protein = self.cache.get_protein_by_uniprot(row['protein_name'])
    #         except KeyError:
    #             # continue if not found
    #             continue
    #         try:
    #             # try fetching disorder
    #             disorder = self.cache.get_disorder_by_mondo(row['disorder_name'])
    #         except KeyError:
    #             # continue if not found
    #             continue
    #         if not update or (self.cache.is_new_protein(protein) or self.cache.is_new_disease(disorder)):
    #             bulk.add(models.ProteinDisorderAssociation(
    #                 pdis_dataset=dataset,
    #                 protein=protein,
    #                 disorder=disorder,
    #                 score=row['score']
    #             ))
    #     models.ProteinDisorderAssociation.objects.bulk_create(bulk)
    #     return len(bulk)

    def populate_drdis_drugbank(self, dataset, update) -> int:
        """ Populates the Drug-Disorder-Indications from DrugBank
        Handles Loading the data and passing it to the django database

        Args:
            dataset: Value stored as ``drdi_dataset`` on every indication.
            update: If truthy, only indications whose drug or disorder is
                reported as new by the cache are added.

        Returns:
            int: Count of how many edges were added
        """
        self.cache.init_drugs()
        self.cache.init_disorders()
        df = DataLoader.load_drdis_drugbank()

        bulk = set()
        for _, row in df.iterrows():
            try:
                # try fetching drug
                drug = self.cache.get_drug_by_drugbank(row['drugbank_id'])
            except KeyError:
                # continue if not found
                continue
            try:
                # try fetching disorder
                disorder = self.cache.get_disorder_by_mondo(row['mondo_id'])
            except KeyError:
                # continue if not found
                continue
            if update and not (self.cache.is_new_drug(drug)
                               or self.cache.is_new_disease(disorder)):
                continue
            bulk.add(models.DrugDisorderIndication(
                drdi_dataset=dataset,
                drug=drug,
                disorder=disorder,
            ))

        models.DrugDisorderIndication.objects.bulk_create(bulk)
        return len(bulk)

    def populate_pdi_dgidb(self, dataset, update) -> int:
        """ Populates the Protein-Drug-Interactions from DGIdb
        Handles Loading the data and passing it to the django database

        Args:
            dataset: Value stored as ``pdi_dataset`` on every interaction.
            update: If truthy, only interactions whose protein or drug is
                reported as new by the cache are added.

        Returns:
            int: Count of how many interactions were added
        """
        self.cache.init_proteins()
        self.cache.init_drugs()
        df = DataLoader.load_pdi_dgidb()

        bulk = set()
        for _, row in df.iterrows():
            try:
                # try fetching proteins
                proteins = self.cache.get_proteins_by_entrez(row['entrez_id'])
            except KeyError:
                # continue if not found
                continue
            try:
                # try fetching drug
                drug = self.cache.get_drug_by_drugbank(row['drug_id'])
            except KeyError:
                # continue if not found
                continue
            for protein in proteins:
                if update and not (self.cache.is_new_protein(protein)
                                   or self.cache.is_new_drug(drug)):
                    continue
                bulk.add(models.ProteinDrugInteraction(
                    pdi_dataset=dataset,
                    protein=protein,
                    drug=drug,
                ))

        models.ProteinDrugInteraction.objects.bulk_create(bulk)
        return len(bulk)
# def populate_pdi_drugbank(self,dataset, update) -> int:
# """ Populates the Protein-Drug-Interactions from Drugbank
# Handles Loading the data and passing it to the django database
#
# Returns:
# int: Count of how many interactions were added
# """
# self.cache.init_proteins()
# self.cache.init_drugs()
#
# df = DataLoader.load_pdi_drugbank()
# bulk = set()
# for _, row in df.iterrows():
# try:
# proteins = self.cache.get_proteins_by_entrez(row['entrez_id'])
# except KeyError:
# continue
# try:
# drug = self.cache.get_drug_by_drugbank(row['drug_id'])
# except KeyError:
# continue
# for protein in proteins:
# if not update or (self.cache.is_new_protein(protein) or self.cache.is_new_drug(drug)):
# bulk.add(models.ProteinDrugInteraction(
# pdi_dataset=dataset,
# protein=protein,
# drug=drug
# ))
# models.ProteinDrugInteraction.objects.bulk_create(bulk)
# return len(bulk)