Skip to content
Snippets Groups Projects
Commit b51d5f07 authored by Malte Schokolowski's avatar Malte Schokolowski
Browse files

rekursives references handling und ein wenig refactoring

parent 42817b92
No related branches found
No related tags found
1 merge request!7Main
# -*- coding: utf-8 -*-
"""
Functions to generate a graph resembling citations between multiple JCIM ACS journals
Functions to generate a graph representing citations between multiple JCIM ACS journals
"""
......@@ -39,17 +39,13 @@ def initialize_nodes_list(doi_input_list):
# adds a node for every publication
# adds a node for every publication unknown
# adds edges for citations between publications
def create_graph_structure(pub, search_depth, search_depth_max):
def create_graph_structure_citations(pub, search_depth, search_depth_max):
for citation in pub._citations:
# Für jede citation, die in der entsprecheneden Klasseninstanz der Publikation gespeichert sind,
# wird geprüft, ob diese bereits als Knoten existiert.
# Wenn die citation noch nicht im Knoten-Array(nodes) existiert UND die maximale Tiefe
# noch nicht erreicht wurde, wird diese als Knoten im Knoten-Array gespeichert. Zusätzlich
# wird die Verbindung zur Publikation als Tupel im Kanten-Array(edges) gespeichert.
not_in_nodes = True
for node in nodes:
# checks every citation for duplication
if (citation.doi_url == node.doi_url):
not_in_nodes = False
break
......@@ -58,68 +54,121 @@ def create_graph_structure(pub, search_depth, search_depth_max):
nodes.append(citation)
edges.append([pub.doi_url,citation.doi_url])
# Wenn die citaion bereits im Knoten-Array existiert, wird nur die Verbindung zur Publikation
# als Tupel im Kanten-Array(edges) gespeichert.
# adds only edge if citation already exists
else:
edges.append([pub.doi_url,citation.doi_url])
# adds a node for every publication unknown
# adds edges for references between publications
def create_graph_structure_references(pub, search_height, search_height_max):
for reference in pub._references:
not_in_nodes = True
for node in nodes:
# checks every reference for duplication
if (reference.doi_url == node.doi_url):
not_in_nodes = False
break
if (not_in_nodes):
if (search_height <= search_height_max):
nodes.append(reference)
edges.append([reference.doi_url,pub.doi_url])
# adds only edge if citation already exists
else:
edges.append([reference.doi_url,pub.doi_url])
# recursive function to implement depth-first-search on citations
# doi_list: input list of dois
# search_depth: current search_depth of dfs
# search_depth_max: maximal search_depth for search_depth-first-search
def process_rec(doi_list, search_depth, search_depth_max):
# Die Tiefe wird bei jedem rekursiven Aufruf um 1 erhöht.
# doi_citations: input list of citet dois
# search_depth: current search_depth of depth-first-search
# search_depth_max: maximal search_depth for dfs
def process_citations_rec(doi_citations, search_depth, search_depth_max):
# depth of search is increased by 1 with each recursive call
search_depth += 1
# Für jede Publikation im Input-Array wird ein Klassenobjekt erstellt.
for pub_doi in doi_list:
# create class object for every citation from list
for pub_doi in doi_citations:
pub = input(pub_doi)
create_graph_structure(pub, search_depth, search_depth_max)
# Wenn die maximale Tiefe noch nicht erreicht wurde, werden alle citations aus der Publikation
# in ein Array geschrieben und mit diesem die Funktion erneut aufgerufen.
create_graph_structure_citations(pub, search_depth, search_depth_max)
# If the maximum depth has not yet been reached, all references from the publication
# are written to an array and the function is called again with this array.
if (search_depth < search_depth_max):
cit_list = []
citations_list = []
for citation in pub._citations:
# Momentan werden nur die citations mit acs in der URL gespeichert, da wir von anderen
# Quellen die Infotmationen nicht extrahieren können.
# currently only the references with acs are stored in the URL, because we can't
# extract the info from other sources.
if ("acs" in citation.doi_url):
cit_list.append(citation.doi_url)
citations_list.append(citation.doi_url)
# recursive call of function.
process_citations_rec(citations_list, search_depth, search_depth_max)
# Rekursiver Aufruf der Funktion.
process_rec(cit_list, search_depth, search_depth_max)
# recursive function to implement depth-first-search on references
# doi_references: input list of referenced dois
# search_height: current search_height of depth-first-search
# search_height_max: maximal search_height for dfs
def process_references_rec(doi_references, search_height, search_height_max):
# The height is increased by 1 with each recursive call
search_height += 1
# create class object for every citation from list
for pub_doi in doi_references:
pub = input(pub_doi)
create_graph_structure_references(pub, search_height, search_height_max)
# If the maximum height has not yet been reached, all references from the publication
# are written to an array and the function is called again with this array.
if (search_height < search_height_max):
references_list = []
for reference in pub._references:
# currently only the references with acs are stored in the URL, because we can't
# extract the info from other sources.
if ("acs" in reference.doi_url):
references_list.append(reference.doi_url)
# recursive call of function.
process_references_rec(references_list, search_height, search_height_max)
def process_main(doi_input_list, search_depth):
def process_main(doi_input_list, search_depth, search_height):
# ERROR-Handling doi_array = NULL
if (len(doi_input_list) == 0):
print("Error, no input data")
# ERROR- wenn für die Tiefe eine negative Zahl eingegeben wird
# ERROR- if a negative number is entered for depth
if (search_depth < 0):
print("Error, search_depth of search must be positive")
# Leeres Array für die Knoten(nodes) wird erstellt.
# Leeres Array für die Kanten(edges) wird erstellt.
# ERROR- if a negative number is entered for height
if (search_height < 0):
print("Error, search_height of search must be positive")
# create empty array for the nodes
# create empty array for the edges
global nodes, edges
nodes = []
edges = []
initialize_nodes_list(doi_input_list)
process_rec(doi_input_list, 0, search_depth)
process_citations_rec(doi_input_list, 0, search_depth)
process_references_rec(doi_input_list, 0, search_height)
output_to_json(nodes,edges)
# Nur fürs interne testen
# only for internal testing
return(nodes,edges)
# Programmtest, weil noch keine Verbindung zum Input besteht.
# program test, because there is no connection to the input yet.
def test_print():
arr = []
arr.append('https://pubs.acs.org/doi/10.1021/acs.jcim.9b00249')
......@@ -132,7 +181,7 @@ def test_print():
#url = sys.argv[1]
#arr.append[url]
nodes,edges = process_main(arr,1)
nodes,edges = process_main(arr,1,1)
print("Knoten:\n")
for node in nodes:
......@@ -140,4 +189,6 @@ def test_print():
print("\n Kanten:\n")
for edge in edges:
print(edge,"\n")
#test_print()
\ No newline at end of file
File added
File added
#!/usr/bin/env python3
"""
Functions for information retrieval of articles from the ACS journal JCIM
"""
__author__ = "Florian Jochens"
__email__ = "fj@andaco.de"
__status__ = "Production"
#__copyright__ = ""
#__credits__ = ["", "", "", ""]
#__license__ = ""
#__version__ = ""
#__maintainer__ = ""
from bs4 import BeautifulSoup as bs
import requests as req
import sys
from pathlib import Path
class Publication:
#_registry = []
_citations = []
def __init__(self, title, publication_date, contributors, doi_url,
subjects, num_citations):
#self._registry.append(self)
self.title = title
self.publication_date = publication_date
self.contributors = contributors
self.doi_url = doi_url
self.subjects = subjects
self.num_citations = num_citations
class Citation:
def __init__(self, title, journal, contributors, doi_url):
self.title = title
self.journal = journal
self.contributors = contributors
self.doi_url = doi_url
def get_article_info(soup):
header = soup.find('div', class_ = 'article_header-left pull-left')
article_title = header.find('span', class_ = 'hlFld-Title').text
publication_date = header.find('span', class_ = 'pub-date-value').text
for link in header.find('div', class_ = 'article_header-doiurl'):
doi_url = link.get('href')
subs = header.find('div', class_ = 'article_header-taxonomy')
subjects = []
for sub in subs.find_all('a'):
subjects.append(sub.get('title'))
cons = header.find('ul', class_ = 'loa')
contributors = []
for con in cons.find_all('span', class_ = 'hlFld-ContribAuthor'):
contributors.append(con.text)
numc = header.find('div', class_ = 'articleMetrics_count')
if not numc.a:
num_citations = 0
else:
num_citations = numc.a.text
pub = Publication(article_title, publication_date, contributors, doi_url,
subjects, num_citations)
return pub
def get_download_url():
export = soup.find('div', class_ = 'cit-download-dropdown_content')
url = 'https://pubs.acs.org'
for link in export.find_all('a'):
if link.get('title') == 'Citation and references':
url += link.get('href')
print(url)
return url
def download(url): # Download citation and references file
if url.find('='):
filename = url.rsplit('=', 1)[1]
path = Path(('./files/' + filename))
if path.is_file():
print("File already exists")
else:
print("File does not exist")
def get_citation_info(pub, num_citations, soup):
pub._citations = []
details = soup.find('ol', class_ = 'cited-content_cbyCitation')
titles = []
for title in details.find_all('span',
class_ = 'cited-content_cbyCitation_article-title'):
titles.append(title.text.replace('.', ''))
journal_names = []
for name in details.find_all('span',
class_ = 'cited-content_cbyCitation_journal-name'):
journal_names.append(name.text)
doi_urls = []
for url in details.find_all('a'):
doi_urls.append(url.get('href'))
contributors = []
for contrib in details.find_all('span',
class_ = 'cited-content_cbyCitation_article-contributors'):
contributors.append(contrib.text)
for i in range(0, int(num_citations)):
pub._citations.append(Citation(titles[i], journal_names[i],
contributors[i], doi_urls[i]))
def print_pub_info(pub):
print(f'''Article title: {pub.title}
Publication date: {pub.publication_date}
DOI-URL: {pub.doi_url}
Subjects:''')
print(*(pub.subjects), sep = ", ")
print('\nContributors:')
print(*(pub.contributors), sep = ", ")
if int(pub.num_citations) > 0:
if int(pub.num_citations) == 1:
print(f'\nThis publication is cited by the following publication:\n')
else:
print(f'\nThis publication is cited by the following {pub.num_citations} publications:\n')
for citation in pub._citations:
print(f'''
Title: {citation.title}
Journal: {citation.journal}
Contributors: {citation.contributors}
DOI-URL: {citation.doi_url}
''')
else:
print('\nThis publication is not cited by any other publication.')
def input(url):
html_text = req.get(url).text
soup = bs(html_text, 'html.parser')
pub = get_article_info(soup)
if int(pub.num_citations) > 0:
get_citation_info(pub, int(pub.num_citations), soup)
return pub
#if len(sys.argv) != 2:
# sys.stderr.write('Usage: {} <url>\n'.format(sys.argv[0]))
# exit(1)
#url = sys.argv[1]
#pub = input(url)
#print_pub_info(pub)
{"nodes": [{"name": "Comparing Molecular Patterns Using the Example of SMARTS: Applications and Filter Collection Analysis", "author": ["Emanuel S. R. Ehmki", "Robert Schmidt", "Farina Ohm", "Matthias Rarey"], "doi": "https://doi.org/10.1021/acs.jcim.9b00249"}, {"name": "Combining Machine Learning and Computational Chemistry for Predictive Insights Into Chemical Systems ", "author": "John A. Keith, Valentin Vassilev-Galindo, Bingqing Cheng, Stefan Chmiela, Michael Gastegger, Klaus-Robert M\u00fcller, Alexandre Tkatchenko. ", "doi": "https://doi.org/10.1021/acs.chemrev.1c00107"}, {"name": "Disconnected Maximum Common Substructures under Constraints ", "author": "Robert Schmidt, Florian Krull, Anna Lina Heinzke, Matthias Rarey. ", "doi": "https://doi.org/10.1021/acs.jcim.0c00741"}, {"name": "Evolution of Novartis\u2019 Small Molecule Screening Deck Design ", "author": "Ansgar Schuffenhauer, Nadine Schneider, Samuel Hintermann, Douglas Auld, Jutta Blank, Simona Cotesta, Caroline Engeloch, Nikolas Fechner, Christoph Gaul, Jerome Giovannoni, Johanna Jansen, John Joslin, Philipp Krastel, Eugen Lounkine, John Manchester, Lauren G. Monovich, Anna Paola Pelliccioli, Manuel Schwarze, Michael D. Shultz, Nikolaus Stiefl, Daniel K. Baeschlin. ", "doi": "https://doi.org/10.1021/acs.jmedchem.0c01332"}, {"name": "Comparing Molecular Patterns Using the Example of SMARTS: Theory and Algorithms ", "author": "Robert Schmidt, Emanuel S. R. Ehmki, Farina Ohm, Hans-Christian Ehrlich, Andriy Mashychev, Matthias Rarey. ", "doi": "https://doi.org/10.1021/acs.jcim.9b00250"}, {"name": "Machine learning accelerates quantum mechanics predictions of molecular crystals ", "author": "Yanqiang Han, Imran Ali, Zhilong Wang, Junfei Cai, Sicheng Wu, Jiequn Tang, Lin Zhang, Jiahao Ren, Rui Xiao, Qianqian Lu, Lei Hang, Hongyuan Luo, Jinjin Li. ", "doi": "https://doi.org/10.1016/j.physrep.2021.08.002"}, {"name": "The Growing Importance of Chirality in 3D Chemical Space Exploration and Modern Drug Discovery Approaches for Hit-ID ", "author": "Ilaria Proietti Silvestri, Paul J. J. Colbon. ", "doi": "https://doi.org/10.1021/acsmedchemlett.1c00251"}, {"name": "Target-Based Evaluation of \u201cDrug-Like\u201d Properties and Ligand Efficiencies ", "author": "Paul D. Leeson, A. Patricia Bento, Anna Gaulton, Anne Hersey, Emma J. Manners, Chris J. Radoux, Andrew R. Leach. ", "doi": "https://doi.org/10.1021/acs.jmedchem.1c00416"}, {"name": "BonMOLi\u00e8re: Small-Sized Libraries of Readily Purchasable Compounds, Optimized to Produce Genuine Hits in Biological Screens across the Protein Space ", "author": "Neann Mathai, Conrad Stork, Johannes Kirchmair. ", "doi": "https://doi.org/10.3390/ijms22157773"}, {"name": "Accelerating high-throughput virtual screening through molecular pool-based active learning ", "author": "David E. Graff, Eugene I. Shakhnovich, Connor W. Coley. ", "doi": "https://doi.org/10.1039/D0SC06805E"}, {"name": "Compound Screening ", "author": "Shin Numao, Gianluca Etienne, Goran Malojcic, Enrico Schmidt, Christoph E. Dumelin. ", "doi": "https://doi.org/10.1016/B978-0-12-820472-6.00078-5"}], "links": [{"source": "https://doi.org/10.1021/acs.jcim.9b00249", "target": "https://doi.org/10.1021/acs.chemrev.1c00107"}, {"source": "https://doi.org/10.1021/acs.jcim.9b00249", "target": "https://doi.org/10.1021/acs.jcim.0c00741"}, {"source": "https://doi.org/10.1021/acs.jcim.9b00249", "target": "https://doi.org/10.1021/acs.jmedchem.0c01332"}, {"source": "https://doi.org/10.1021/acs.jcim.9b00249", "target": "https://doi.org/10.1021/acs.jcim.9b00250"}, {"source": "https://doi.org/10.1021/acs.jcim.9b00249", "target": "https://doi.org/10.1016/j.physrep.2021.08.002"}, {"source": "https://doi.org/10.1021/acs.jmedchem.0c01332", "target": "https://doi.org/10.1021/acsmedchemlett.1c00251"}, {"source": "https://doi.org/10.1021/acs.jmedchem.0c01332", "target": "https://doi.org/10.1021/acs.jmedchem.1c00416"}, {"source": "https://doi.org/10.1021/acs.jmedchem.0c01332", "target": "https://doi.org/10.3390/ijms22157773"}, {"source": "https://doi.org/10.1021/acs.jmedchem.0c01332", "target": "https://doi.org/10.1039/D0SC06805E"}, {"source": "https://doi.org/10.1021/acs.jmedchem.0c01332", "target": "https://doi.org/10.1016/B978-0-12-820472-6.00078-5"}]}
\ No newline at end of file
{"nodes": [{"name": "Comparing Molecular Patterns Using the Example of SMARTS: Applications and Filter Collection Analysis", "author": ["Emanuel S. R. Ehmki", "Robert Schmidt", "Farina Ohm", "Matthias Rarey"], "doi": "https://doi.org/10.1021/acs.jcim.9b00249"}, {"name": "Combining Machine Learning and Computational Chemistry for Predictive Insights Into Chemical Systems ", "author": "John A. Keith, Valentin Vassilev-Galindo, Bingqing Cheng, Stefan Chmiela, Michael Gastegger, Klaus-Robert M\u00fcller, Alexandre Tkatchenko. ", "doi": "https://doi.org/10.1021/acs.chemrev.1c00107"}, {"name": "Disconnected Maximum Common Substructures under Constraints ", "author": "Robert Schmidt, Florian Krull, Anna Lina Heinzke, Matthias Rarey. ", "doi": "https://doi.org/10.1021/acs.jcim.0c00741"}, {"name": "Evolution of Novartis\u2019 Small Molecule Screening Deck Design ", "author": "Ansgar Schuffenhauer, Nadine Schneider, Samuel Hintermann, Douglas Auld, Jutta Blank, Simona Cotesta, Caroline Engeloch, Nikolas Fechner, Christoph Gaul, Jerome Giovannoni, Johanna Jansen, John Joslin, Philipp Krastel, Eugen Lounkine, John Manchester, Lauren G. Monovich, Anna Paola Pelliccioli, Manuel Schwarze, Michael D. Shultz, Nikolaus Stiefl, Daniel K. Baeschlin. ", "doi": "https://doi.org/10.1021/acs.jmedchem.0c01332"}, {"name": "Comparing Molecular Patterns Using the Example of SMARTS: Theory and Algorithms ", "author": "Robert Schmidt, Emanuel S. R. Ehmki, Farina Ohm, Hans-Christian Ehrlich, Andriy Mashychev, Matthias Rarey. ", "doi": "https://doi.org/10.1021/acs.jcim.9b00250"}, {"name": "Machine learning accelerates quantum mechanics predictions of molecular crystals ", "author": "Yanqiang Han, Imran Ali, Zhilong Wang, Junfei Cai, Sicheng Wu, Jiequn Tang, Lin Zhang, Jiahao Ren, Rui Xiao, Qianqian Lu, Lei Hang, Hongyuan Luo, Jinjin Li. ", "doi": "https://doi.org/10.1016/j.physrep.2021.08.002"}, {"name": "The Growing Importance of Chirality in 3D Chemical Space Exploration and Modern Drug Discovery Approaches for Hit-ID ", "author": "Ilaria Proietti Silvestri, Paul J. J. Colbon. ", "doi": "https://doi.org/10.1021/acsmedchemlett.1c00251"}, {"name": "Target-Based Evaluation of \u201cDrug-Like\u201d Properties and Ligand Efficiencies ", "author": "Paul D. Leeson, A. Patricia Bento, Anna Gaulton, Anne Hersey, Emma J. Manners, Chris J. Radoux, Andrew R. Leach. ", "doi": "https://doi.org/10.1021/acs.jmedchem.1c00416"}, {"name": "Fostering Research Synergies between Chemists in Swiss Academia and at Novartis ", "author": "Arndt Meyer, Daniel Baeschlin, Cara E. Brocklehurst, Myriam Duckely, Fabrice Gallou, Lucie E. Lovelle, Michael Parmentier, Thierry Schlama, Radka Snajdrova, Yves P. Auberson. ", "doi": "https://doi.org/10.2533/chimia.2021.936"}, {"name": "BonMOLi\u00e8re: Small-Sized Libraries of Readily Purchasable Compounds, Optimized to Produce Genuine Hits in Biological Screens across the Protein Space ", "author": "Neann Mathai, Conrad Stork, Johannes Kirchmair. ", "doi": "https://doi.org/10.3390/ijms22157773"}, {"name": "Accelerating high-throughput virtual screening through molecular pool-based active learning ", "author": "David E. Graff, Eugene I. Shakhnovich, Connor W. Coley. ", "doi": "https://doi.org/10.1039/D0SC06805E"}, {"name": "Compound Screening ", "author": "Shin Numao, Gianluca Etienne, Goran Malojcic, Enrico Schmidt, Christoph E. Dumelin. ", "doi": "https://doi.org/10.1016/B978-0-12-820472-6.00078-5"}], "links": [{"source": "https://doi.org/10.1021/acs.jcim.9b00249", "target": "https://doi.org/10.1021/acs.chemrev.1c00107"}, {"source": "https://doi.org/10.1021/acs.jcim.9b00249", "target": "https://doi.org/10.1021/acs.jcim.0c00741"}, {"source": "https://doi.org/10.1021/acs.jcim.9b00249", "target": "https://doi.org/10.1021/acs.jmedchem.0c01332"}, {"source": "https://doi.org/10.1021/acs.jcim.9b00249", "target": "https://doi.org/10.1021/acs.jcim.9b00250"}, {"source": "https://doi.org/10.1021/acs.jcim.9b00249", "target": "https://doi.org/10.1016/j.physrep.2021.08.002"}, {"source": "https://doi.org/10.1021/acs.jmedchem.0c01332", "target": "https://doi.org/10.1021/acsmedchemlett.1c00251"}, {"source": "https://doi.org/10.1021/acs.jmedchem.0c01332", "target": "https://doi.org/10.1021/acs.jmedchem.1c00416"}, {"source": "https://doi.org/10.1021/acs.jmedchem.0c01332", "target": "https://doi.org/10.2533/chimia.2021.936"}, {"source": "https://doi.org/10.1021/acs.jmedchem.0c01332", "target": "https://doi.org/10.3390/ijms22157773"}, {"source": "https://doi.org/10.1021/acs.jmedchem.0c01332", "target": "https://doi.org/10.1039/D0SC06805E"}, {"source": "https://doi.org/10.1021/acs.jmedchem.0c01332", "target": "https://doi.org/10.1016/B978-0-12-820472-6.00078-5"}]}
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment