diff --git a/verarbeitung/Processing.py b/verarbeitung/Processing.py index cdde51f5842fe5351ee2ec83c3ceac89f250b3ee..872fa129f92d12292995c39c7a4a02e03f695566 100644 --- a/verarbeitung/Processing.py +++ b/verarbeitung/Processing.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- """ -Functions to generate a graph resembling citations between multiple JCIM ACS journals +Functions to generate a graph representing citations between multiple JCIM ACS journals """ @@ -39,17 +39,13 @@ def initialize_nodes_list(doi_input_list): -# adds a node for every publication +# adds a node for every publication unknown # adds edges for citations between publications -def create_graph_structure(pub, search_depth, search_depth_max): +def create_graph_structure_citations(pub, search_depth, search_depth_max): for citation in pub._citations: - # Für jede citation, die in der entsprecheneden Klasseninstanz der Publikation gespeichert sind, - # wird geprüft, ob diese bereits als Knoten existiert. - # Wenn die citation noch nicht im Knoten-Array(nodes) existiert UND die maximale Tiefe - # noch nicht erreicht wurde, wird diese als Knoten im Knoten-Array gespeichert. Zusätzlich - # wird die Verbindung zur Publikation als Tupel im Kanten-Array(edges) gespeichert. not_in_nodes = True for node in nodes: + # checks every citation for duplication if (citation.doi_url == node.doi_url): not_in_nodes = False break @@ -58,68 +54,121 @@ def create_graph_structure(pub, search_depth, search_depth_max): nodes.append(citation) edges.append([pub.doi_url,citation.doi_url]) - # Wenn die citaion bereits im Knoten-Array existiert, wird nur die Verbindung zur Publikation - # als Tupel im Kanten-Array(edges) gespeichert. + # adds only edge if citation already exists else: edges.append([pub.doi_url,citation.doi_url]) + + + +# adds a node for every publication unknown +# adds edges for references between publications +def create_graph_structure_references(pub, search_height, search_height_max): + for reference in pub._references: + not_in_nodes = True + for node in nodes: + # checks every reference for duplication + if (reference.doi_url == node.doi_url): + not_in_nodes = False + break + if (not_in_nodes): + if (search_height <= search_height_max): + nodes.append(reference) + edges.append([reference.doi_url,pub.doi_url]) + + # adds only edge if citation already exists + else: + edges.append([reference.doi_url,pub.doi_url]) - + + # recursive function to implement depth-first-search on citations -# doi_list: input list of dois -# search_depth: current search_depth of dfs -# search_depth_max: maximal search_depth for search_depth-first-search -def process_rec(doi_list, search_depth, search_depth_max): - # Die Tiefe wird bei jedem rekursiven Aufruf um 1 erhöht. +# doi_citations: input list of citet dois +# search_depth: current search_depth of depth-first-search +# search_depth_max: maximal search_depth for dfs +def process_citations_rec(doi_citations, search_depth, search_depth_max): + # depth of search is increased by 1 with each recursive call search_depth += 1 - # Für jede Publikation im Input-Array wird ein Klassenobjekt erstellt. - for pub_doi in doi_list: + # create class object for every citation from list + for pub_doi in doi_citations: pub = input(pub_doi) - create_graph_structure(pub, search_depth, search_depth_max) - # Wenn die maximale Tiefe noch nicht erreicht wurde, werden alle citations aus der Publikation - # in ein Array geschrieben und mit diesem die Funktion erneut aufgerufen. + create_graph_structure_citations(pub, search_depth, search_depth_max) + # If the maximum depth has not yet been reached, all references from the publication + # are written to an array and the function is called again with this array. if (search_depth < search_depth_max): - cit_list = [] + citations_list = [] for citation in pub._citations: - # Momentan werden nur die citations mit acs in der URL gespeichert, da wir von anderen - # Quellen die Infotmationen nicht extrahieren können. + # currently only the references with acs are stored in the URL, because we can't + # extract the info from other sources. if ("acs" in citation.doi_url): - cit_list.append(citation.doi_url) + citations_list.append(citation.doi_url) + + # recursive call of function. + process_citations_rec(citations_list, search_depth, search_depth_max) + - # Rekursiver Aufruf der Funktion. - process_rec(cit_list, search_depth, search_depth_max) + +# recursive function to implement depth-first-search on references +# doi_references: input list of referenced dois +# search_height: current search_height of depth-first-search +# search_height_max: maximal search_height for dfs +def process_references_rec(doi_references, search_height, search_height_max): + # The height is increased by 1 with each recursive call + search_height += 1 + + # create class object for every citation from list + for pub_doi in doi_references: + pub = input(pub_doi) + create_graph_structure_references(pub, search_height, search_height_max) + # If the maximum height has not yet been reached, all references from the publication + # are written to an array and the function is called again with this array. + if (search_height < search_height_max): + references_list = [] + for reference in pub._references: + + # currently only the references with acs are stored in the URL, because we can't + # extract the info from other sources. + if ("acs" in reference.doi_url): + references_list.append(reference.doi_url) + + # recursive call of function. + process_references_rec(references_list, search_height, search_height_max) -def process_main(doi_input_list, search_depth): +def process_main(doi_input_list, search_depth, search_height): # ERROR-Handling doi_array = NULL if (len(doi_input_list) == 0): print("Error, no input data") - # ERROR- wenn für die Tiefe eine negative Zahl eingegeben wird + # ERROR- if a negative number is entered for depth if (search_depth < 0): print("Error, search_depth of search must be positive") - - # Leeres Array für die Knoten(nodes) wird erstellt. - # Leeres Array für die Kanten(edges) wird erstellt. + # ERROR- if a negative number is entered for height + if (search_height < 0): + print("Error, search_height of search must be positive") + + # create empty array for the nodes + # create empty array for the edges global nodes, edges nodes = [] edges = [] initialize_nodes_list(doi_input_list) - process_rec(doi_input_list, 0, search_depth) + process_citations_rec(doi_input_list, 0, search_depth) + process_references_rec(doi_input_list, 0, search_height) output_to_json(nodes,edges) - # Nur fürs interne testen + # only for internal testing return(nodes,edges) -# Programmtest, weil noch keine Verbindung zum Input besteht. +# program test, because there is no connection to the input yet. def test_print(): arr = [] arr.append('https://pubs.acs.org/doi/10.1021/acs.jcim.9b00249') @@ -132,7 +181,7 @@ def test_print(): #url = sys.argv[1] #arr.append[url] - nodes,edges = process_main(arr,1) + nodes,edges = process_main(arr,1,1) print("Knoten:\n") for node in nodes: @@ -140,4 +189,6 @@ def test_print(): print("\n Kanten:\n") for edge in edges: print(edge,"\n") + +#test_print() \ No newline at end of file diff --git a/verarbeitung/__pycache__/input_fj.cpython-39.pyc b/verarbeitung/__pycache__/input_fj.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..175f9ebbfdf5f3313196b4f10aa01dc2e8e20509 Binary files /dev/null and b/verarbeitung/__pycache__/input_fj.cpython-39.pyc differ diff --git a/verarbeitung/__pycache__/json_demo.cpython-39.pyc b/verarbeitung/__pycache__/json_demo.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..29c2c14b5229d9768c2a273f8e1adba6cfcfc63f Binary files /dev/null and b/verarbeitung/__pycache__/json_demo.cpython-39.pyc differ diff --git a/verarbeitung/input_fj.py b/verarbeitung/input_fj.py new file mode 100644 index 0000000000000000000000000000000000000000..00bb0126e2ae1abf6563bf99a16cc585b6d88077 --- /dev/null +++ b/verarbeitung/input_fj.py @@ -0,0 +1,144 @@ +#!/usr/bin/env python3 +""" +Functions for information retrieval of articles from the ACS journal JCIM + +""" + +__author__ = "Florian Jochens" +__email__ = "fj@andaco.de" +__status__ = "Production" +#__copyright__ = "" +#__credits__ = ["", "", "", ""] +#__license__ = "" +#__version__ = "" +#__maintainer__ = "" + +from bs4 import BeautifulSoup as bs +import requests as req +import sys +from pathlib import Path + +class Publication: + #_registry = [] + _citations = [] + + def __init__(self, title, publication_date, contributors, doi_url, + subjects, num_citations): + #self._registry.append(self) + self.title = title + self.publication_date = publication_date + self.contributors = contributors + self.doi_url = doi_url + self.subjects = subjects + self.num_citations = num_citations + +class Citation: + def __init__(self, title, journal, contributors, doi_url): + self.title = title + self.journal = journal + self.contributors = contributors + self.doi_url = doi_url + +def get_article_info(soup): + header = soup.find('div', class_ = 'article_header-left pull-left') + article_title = header.find('span', class_ = 'hlFld-Title').text + publication_date = header.find('span', class_ = 'pub-date-value').text + for link in header.find('div', class_ = 'article_header-doiurl'): + doi_url = link.get('href') + subs = header.find('div', class_ = 'article_header-taxonomy') + subjects = [] + for sub in subs.find_all('a'): + subjects.append(sub.get('title')) + cons = header.find('ul', class_ = 'loa') + contributors = [] + for con in cons.find_all('span', class_ = 'hlFld-ContribAuthor'): + contributors.append(con.text) + numc = header.find('div', class_ = 'articleMetrics_count') + if not numc.a: + num_citations = 0 + else: + num_citations = numc.a.text + + pub = Publication(article_title, publication_date, contributors, doi_url, + subjects, num_citations) + return pub + +def get_download_url(): + export = soup.find('div', class_ = 'cit-download-dropdown_content') + url = 'https://pubs.acs.org' + for link in export.find_all('a'): + if link.get('title') == 'Citation and references': + url += link.get('href') + print(url) + return url + +def download(url): # Download citation and references file + if url.find('='): + filename = url.rsplit('=', 1)[1] + path = Path(('./files/' + filename)) + if path.is_file(): + print("File already exists") + else: + print("File does not exist") + +def get_citation_info(pub, num_citations, soup): + pub._citations = [] + details = soup.find('ol', class_ = 'cited-content_cbyCitation') + titles = [] + for title in details.find_all('span', + class_ = 'cited-content_cbyCitation_article-title'): + titles.append(title.text.replace('.', '')) + journal_names = [] + for name in details.find_all('span', + class_ = 'cited-content_cbyCitation_journal-name'): + journal_names.append(name.text) + doi_urls = [] + for url in details.find_all('a'): + doi_urls.append(url.get('href')) + contributors = [] + for contrib in details.find_all('span', + class_ = 'cited-content_cbyCitation_article-contributors'): + contributors.append(contrib.text) + for i in range(0, int(num_citations)): + pub._citations.append(Citation(titles[i], journal_names[i], + contributors[i], doi_urls[i])) +def print_pub_info(pub): + print(f'''Article title: {pub.title} +Publication date: {pub.publication_date} +DOI-URL: {pub.doi_url} + +Subjects:''') + print(*(pub.subjects), sep = ", ") + print('\nContributors:') + print(*(pub.contributors), sep = ", ") + + if int(pub.num_citations) > 0: + if int(pub.num_citations) == 1: + print(f'\nThis publication is cited by the following publication:\n') + else: + print(f'\nThis publication is cited by the following {pub.num_citations} publications:\n') + for citation in pub._citations: + print(f''' + Title: {citation.title} + Journal: {citation.journal} + Contributors: {citation.contributors} + DOI-URL: {citation.doi_url} + ''') + else: + print('\nThis publication is not cited by any other publication.') + +def input(url): + html_text = req.get(url).text + soup = bs(html_text, 'html.parser') + + pub = get_article_info(soup) + if int(pub.num_citations) > 0: + get_citation_info(pub, int(pub.num_citations), soup) + return pub + +#if len(sys.argv) != 2: +# sys.stderr.write('Usage: {} <url>\n'.format(sys.argv[0])) +# exit(1) +#url = sys.argv[1] +#pub = input(url) +#print_pub_info(pub) diff --git a/verarbeitung/json_text.json b/verarbeitung/json_text.json index 41455139941f560a6da139fae4c185ac19acdee0..2e6c8fa59091aa94ac7c0f6ec5abf1e19d329b31 100644 --- a/verarbeitung/json_text.json +++ b/verarbeitung/json_text.json @@ -1 +1 @@ -{"nodes": [{"name": "Comparing Molecular Patterns Using the Example of SMARTS: Applications and Filter Collection Analysis", "author": ["Emanuel S. R. Ehmki", "Robert Schmidt", "Farina Ohm", "Matthias Rarey"], "doi": "https://doi.org/10.1021/acs.jcim.9b00249"}, {"name": "Combining Machine Learning and Computational Chemistry for Predictive Insights Into Chemical Systems ", "author": "John A. Keith, Valentin Vassilev-Galindo, Bingqing Cheng, Stefan Chmiela, Michael Gastegger, Klaus-Robert M\u00fcller, Alexandre Tkatchenko. ", "doi": "https://doi.org/10.1021/acs.chemrev.1c00107"}, {"name": "Disconnected Maximum Common Substructures under Constraints ", "author": "Robert Schmidt, Florian Krull, Anna Lina Heinzke, Matthias Rarey. ", "doi": "https://doi.org/10.1021/acs.jcim.0c00741"}, {"name": "Evolution of Novartis\u2019 Small Molecule Screening Deck Design ", "author": "Ansgar Schuffenhauer, Nadine Schneider, Samuel Hintermann, Douglas Auld, Jutta Blank, Simona Cotesta, Caroline Engeloch, Nikolas Fechner, Christoph Gaul, Jerome Giovannoni, Johanna Jansen, John Joslin, Philipp Krastel, Eugen Lounkine, John Manchester, Lauren G. Monovich, Anna Paola Pelliccioli, Manuel Schwarze, Michael D. Shultz, Nikolaus Stiefl, Daniel K. Baeschlin. ", "doi": "https://doi.org/10.1021/acs.jmedchem.0c01332"}, {"name": "Comparing Molecular Patterns Using the Example of SMARTS: Theory and Algorithms ", "author": "Robert Schmidt, Emanuel S. R. Ehmki, Farina Ohm, Hans-Christian Ehrlich, Andriy Mashychev, Matthias Rarey. ", "doi": "https://doi.org/10.1021/acs.jcim.9b00250"}, {"name": "Machine learning accelerates quantum mechanics predictions of molecular crystals ", "author": "Yanqiang Han, Imran Ali, Zhilong Wang, Junfei Cai, Sicheng Wu, Jiequn Tang, Lin Zhang, Jiahao Ren, Rui Xiao, Qianqian Lu, Lei Hang, Hongyuan Luo, Jinjin Li. ", "doi": "https://doi.org/10.1016/j.physrep.2021.08.002"}, {"name": "The Growing Importance of Chirality in 3D Chemical Space Exploration and Modern Drug Discovery Approaches for Hit-ID ", "author": "Ilaria Proietti Silvestri, Paul J. J. Colbon. ", "doi": "https://doi.org/10.1021/acsmedchemlett.1c00251"}, {"name": "Target-Based Evaluation of \u201cDrug-Like\u201d Properties and Ligand Efficiencies ", "author": "Paul D. Leeson, A. Patricia Bento, Anna Gaulton, Anne Hersey, Emma J. Manners, Chris J. Radoux, Andrew R. Leach. ", "doi": "https://doi.org/10.1021/acs.jmedchem.1c00416"}, {"name": "BonMOLi\u00e8re: Small-Sized Libraries of Readily Purchasable Compounds, Optimized to Produce Genuine Hits in Biological Screens across the Protein Space ", "author": "Neann Mathai, Conrad Stork, Johannes Kirchmair. ", "doi": "https://doi.org/10.3390/ijms22157773"}, {"name": "Accelerating high-throughput virtual screening through molecular pool-based active learning ", "author": "David E. Graff, Eugene I. Shakhnovich, Connor W. Coley. ", "doi": "https://doi.org/10.1039/D0SC06805E"}, {"name": "Compound Screening ", "author": "Shin Numao, Gianluca Etienne, Goran Malojcic, Enrico Schmidt, Christoph E. Dumelin. ", "doi": "https://doi.org/10.1016/B978-0-12-820472-6.00078-5"}], "links": [{"source": "https://doi.org/10.1021/acs.jcim.9b00249", "target": "https://doi.org/10.1021/acs.chemrev.1c00107"}, {"source": "https://doi.org/10.1021/acs.jcim.9b00249", "target": "https://doi.org/10.1021/acs.jcim.0c00741"}, {"source": "https://doi.org/10.1021/acs.jcim.9b00249", "target": "https://doi.org/10.1021/acs.jmedchem.0c01332"}, {"source": "https://doi.org/10.1021/acs.jcim.9b00249", "target": "https://doi.org/10.1021/acs.jcim.9b00250"}, {"source": "https://doi.org/10.1021/acs.jcim.9b00249", "target": "https://doi.org/10.1016/j.physrep.2021.08.002"}, {"source": "https://doi.org/10.1021/acs.jmedchem.0c01332", "target": "https://doi.org/10.1021/acsmedchemlett.1c00251"}, {"source": "https://doi.org/10.1021/acs.jmedchem.0c01332", "target": "https://doi.org/10.1021/acs.jmedchem.1c00416"}, {"source": "https://doi.org/10.1021/acs.jmedchem.0c01332", "target": "https://doi.org/10.3390/ijms22157773"}, {"source": "https://doi.org/10.1021/acs.jmedchem.0c01332", "target": "https://doi.org/10.1039/D0SC06805E"}, {"source": "https://doi.org/10.1021/acs.jmedchem.0c01332", "target": "https://doi.org/10.1016/B978-0-12-820472-6.00078-5"}]} \ No newline at end of file +{"nodes": [{"name": "Comparing Molecular Patterns Using the Example of SMARTS: Applications and Filter Collection Analysis", "author": ["Emanuel S. R. Ehmki", "Robert Schmidt", "Farina Ohm", "Matthias Rarey"], "doi": "https://doi.org/10.1021/acs.jcim.9b00249"}, {"name": "Combining Machine Learning and Computational Chemistry for Predictive Insights Into Chemical Systems ", "author": "John A. Keith, Valentin Vassilev-Galindo, Bingqing Cheng, Stefan Chmiela, Michael Gastegger, Klaus-Robert M\u00fcller, Alexandre Tkatchenko. ", "doi": "https://doi.org/10.1021/acs.chemrev.1c00107"}, {"name": "Disconnected Maximum Common Substructures under Constraints ", "author": "Robert Schmidt, Florian Krull, Anna Lina Heinzke, Matthias Rarey. ", "doi": "https://doi.org/10.1021/acs.jcim.0c00741"}, {"name": "Evolution of Novartis\u2019 Small Molecule Screening Deck Design ", "author": "Ansgar Schuffenhauer, Nadine Schneider, Samuel Hintermann, Douglas Auld, Jutta Blank, Simona Cotesta, Caroline Engeloch, Nikolas Fechner, Christoph Gaul, Jerome Giovannoni, Johanna Jansen, John Joslin, Philipp Krastel, Eugen Lounkine, John Manchester, Lauren G. Monovich, Anna Paola Pelliccioli, Manuel Schwarze, Michael D. Shultz, Nikolaus Stiefl, Daniel K. Baeschlin. ", "doi": "https://doi.org/10.1021/acs.jmedchem.0c01332"}, {"name": "Comparing Molecular Patterns Using the Example of SMARTS: Theory and Algorithms ", "author": "Robert Schmidt, Emanuel S. R. Ehmki, Farina Ohm, Hans-Christian Ehrlich, Andriy Mashychev, Matthias Rarey. ", "doi": "https://doi.org/10.1021/acs.jcim.9b00250"}, {"name": "Machine learning accelerates quantum mechanics predictions of molecular crystals ", "author": "Yanqiang Han, Imran Ali, Zhilong Wang, Junfei Cai, Sicheng Wu, Jiequn Tang, Lin Zhang, Jiahao Ren, Rui Xiao, Qianqian Lu, Lei Hang, Hongyuan Luo, Jinjin Li. ", "doi": "https://doi.org/10.1016/j.physrep.2021.08.002"}, {"name": "The Growing Importance of Chirality in 3D Chemical Space Exploration and Modern Drug Discovery Approaches for Hit-ID ", "author": "Ilaria Proietti Silvestri, Paul J. J. Colbon. ", "doi": "https://doi.org/10.1021/acsmedchemlett.1c00251"}, {"name": "Target-Based Evaluation of \u201cDrug-Like\u201d Properties and Ligand Efficiencies ", "author": "Paul D. Leeson, A. Patricia Bento, Anna Gaulton, Anne Hersey, Emma J. Manners, Chris J. Radoux, Andrew R. Leach. ", "doi": "https://doi.org/10.1021/acs.jmedchem.1c00416"}, {"name": "Fostering Research Synergies between Chemists in Swiss Academia and at Novartis ", "author": "Arndt Meyer, Daniel Baeschlin, Cara E. Brocklehurst, Myriam Duckely, Fabrice Gallou, Lucie E. Lovelle, Michael Parmentier, Thierry Schlama, Radka Snajdrova, Yves P. Auberson. ", "doi": "https://doi.org/10.2533/chimia.2021.936"}, {"name": "BonMOLi\u00e8re: Small-Sized Libraries of Readily Purchasable Compounds, Optimized to Produce Genuine Hits in Biological Screens across the Protein Space ", "author": "Neann Mathai, Conrad Stork, Johannes Kirchmair. ", "doi": "https://doi.org/10.3390/ijms22157773"}, {"name": "Accelerating high-throughput virtual screening through molecular pool-based active learning ", "author": "David E. Graff, Eugene I. Shakhnovich, Connor W. Coley. ", "doi": "https://doi.org/10.1039/D0SC06805E"}, {"name": "Compound Screening ", "author": "Shin Numao, Gianluca Etienne, Goran Malojcic, Enrico Schmidt, Christoph E. Dumelin. ", "doi": "https://doi.org/10.1016/B978-0-12-820472-6.00078-5"}], "links": [{"source": "https://doi.org/10.1021/acs.jcim.9b00249", "target": "https://doi.org/10.1021/acs.chemrev.1c00107"}, {"source": "https://doi.org/10.1021/acs.jcim.9b00249", "target": "https://doi.org/10.1021/acs.jcim.0c00741"}, {"source": "https://doi.org/10.1021/acs.jcim.9b00249", "target": "https://doi.org/10.1021/acs.jmedchem.0c01332"}, {"source": "https://doi.org/10.1021/acs.jcim.9b00249", "target": "https://doi.org/10.1021/acs.jcim.9b00250"}, {"source": "https://doi.org/10.1021/acs.jcim.9b00249", "target": "https://doi.org/10.1016/j.physrep.2021.08.002"}, {"source": "https://doi.org/10.1021/acs.jmedchem.0c01332", "target": "https://doi.org/10.1021/acsmedchemlett.1c00251"}, {"source": "https://doi.org/10.1021/acs.jmedchem.0c01332", "target": "https://doi.org/10.1021/acs.jmedchem.1c00416"}, {"source": "https://doi.org/10.1021/acs.jmedchem.0c01332", "target": "https://doi.org/10.2533/chimia.2021.936"}, {"source": "https://doi.org/10.1021/acs.jmedchem.0c01332", "target": "https://doi.org/10.3390/ijms22157773"}, {"source": "https://doi.org/10.1021/acs.jmedchem.0c01332", "target": "https://doi.org/10.1039/D0SC06805E"}, {"source": "https://doi.org/10.1021/acs.jmedchem.0c01332", "target": "https://doi.org/10.1016/B978-0-12-820472-6.00078-5"}]} \ No newline at end of file