diff --git "a/verarbeitung/Processing_test_doi_\303\274berarbeitet.py" b/verarbeitung/Processing.py similarity index 77% rename from "verarbeitung/Processing_test_doi_\303\274berarbeitet.py" rename to verarbeitung/Processing.py index ac6ce63dc512a4e53303ad75f74f04bf6221e191..bfa533a3161063b7ec82231cbc2c34950336eb11 100644 --- "a/verarbeitung/Processing_test_doi_\303\274berarbeitet.py" +++ b/verarbeitung/Processing.py @@ -10,12 +10,13 @@ import requests as req import sys from pathlib import Path from input_fj import input +from json_demo import output_to_json -def process_main(array, depth): +def process_main(doi_input_array, depth): # ERROR-Handling doi_array = NULL - if (len(array) == 0): + if (len(doi_input_array) == 0): print("Error, no input data") # ERROR- wenn für die Tiefe eine negative Zahl eingegeben wird @@ -30,14 +31,21 @@ def process_main(array, depth): edges = [] # Jede Publikation aus dem Input-Array wird in den Knoten-Array(nodes) eingefügt. - for pub in array: - if (pub not in nodes): + for pub_doi in doi_input_array: + pub = input(pub_doi) + not_in_nodes = True + for node in nodes: + if (pub.doi_url == node.doi_url): + not_in_nodes = False + break + if (not_in_nodes): nodes.append(pub) else: - array.remove(pub) + doi_input_array.remove(pub_doi) - process_rec_depth(array, 0, depth) + process_rec_depth(doi_input_array, 0, depth) + output_to_json(nodes,edges) return(nodes,edges) @@ -56,9 +64,14 @@ def process_rec_depth(array, depth, depth_max): # Wenn die citation noch nicht im Knoten-Array(nodes) existiert UND die maximale Tiefe # noch nicht erreicht wurde, wird diese als Knoten im Knoten-Array gespeichert. Zusätzlich # wird die Verbindung zur Publikation als Tupel im Kanten-Array(edges) gespeichert. - if (citation.doi_url not in nodes): + not_in_nodes = True + for node in nodes: + if (citation.doi_url == node.doi_url): + not_in_nodes = False + break + if (not_in_nodes): if (depth <= depth_max): - nodes.append(citation.doi_url) + nodes.append(citation) edges.append([pub.doi_url,citation.doi_url]) # Wenn die citaion bereits im Knoten-Array existiert, wird nur die Verbindung zur Publikation @@ -85,8 +98,8 @@ def process_rec_depth(array, depth, depth_max): # Programmtest, weil noch keine Verbindung zum Input besteht. arr = [] arr.append('https://pubs.acs.org/doi/10.1021/acs.jcim.9b00249') -#arr.append('https://pubs.acs.org/doi/10.1021/acs.jcim.9b00249') -#arr.append('https://doi.org/10.1021/acs.jmedchem.0c01332') +arr.append('https://pubs.acs.org/doi/10.1021/acs.jcim.9b00249') +arr.append('https://doi.org/10.1021/acs.jmedchem.0c01332') #arr.append('https://doi.org/10.1021/acs.jcim.0c00741') #arr.append('https://doi.org/10.1021/ci700007b') @@ -97,8 +110,8 @@ arr.append('https://pubs.acs.org/doi/10.1021/acs.jcim.9b00249') nodes,edges = process_main(arr,1) print("Knoten:\n") -for vortex in nodes: - print(vortex, "\n") +for node in nodes: + print(node.title, "\n") print("\nKanten:\n") for edge in edges: print(edge,"\n") \ No newline at end of file diff --git a/verarbeitung/Processing_test.py b/verarbeitung/Processing_test.py deleted file mode 100644 index 83631bffd92012e7dec57079cee9755fc673a295..0000000000000000000000000000000000000000 --- a/verarbeitung/Processing_test.py +++ /dev/null @@ -1,101 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Created on Wed Nov 3 16:54:43 2021 - -@author: Malte Schokolowski -""" - -from bs4 import BeautifulSoup as bs -import requests as req -import sys -from pathlib import Path -import input_test as inp - - -def process_main(array, depth): - #ERROR-Handling doi_array = NULL, Tiefe < 0 oder 1 ??? + - if (depth < 0): - print("Error, depth of search must be positive") - - - - # leeres Array für die Knoten wird erstellt - # leeres Array für die Kanten wird erstellt - global V, E - V = [] - E = [] - - # Füge in Knoten-Array alle Starterknoten ein - for pub_doi in array: - pub = inp.input(pub_doi) - V.append(pub) - #print("\n") - process_rec(array, 0, depth) - return(V,E) - - -def process_rec(array, depth, depth_max): - depth += 1 - for pub_doi in array: - # Input aufrufen und speichern - pub = inp.input(pub_doi) - - # Klasseninstanz bestehend aus u.a. - # Name, Autoren, DOI, Jahr, - # was_wir_zitiert_haben, wo_wir_zitiert_wurden - for citation in pub._citations: - #print(pub.doi_url, ".\t", citation.doi_url, "\n") - #Knoten j erstellen, wenn noch unbekannt - if (citation not in V): - if (depth <= depth_max): - V.append(citation) - #print(citation.doi_url, "\n") - E.append([pub,citation]) - #print(pub.doi_url, ".\t", citation.doi_url, "\n") - - else: - E.append([pub,citation]) # Kante erstellen, wenn citation bekannt, also wenn beide im Input sind oder bei Zyklus - #print(pub.doi_url, ".\t", citation.doi_url, "\n") - - - #for k in wo_wir_zitiert_wurden: - #if (i != k): - #Knoten k erstellen, wenn noch unbekannt - #Kante erstellen von k nach i - - if (depth < depth_max): - cit_arr = [] - for citation in pub._citations: - if ("acs" in citation.doi_url): - cit_arr.append(citation.doi_url) - process_rec(cit_arr, depth, depth_max) - - - - # Knoten- und Kantenmenge zurückgeben - # {1,2,3,4,5} oder - # {{1="paper1",0}, {2 = "paper2"},1} oder - # {1="paper1", 2 = "paper2"} - # {(1,2),(2,3),(2,4)} - - -arr = [] -arr.append('https://pubs.acs.org/doi/10.1021/acs.jcim.9b00249') -arr.append('https://doi.org/10.1021/acs.jmedchem.0c01332') -#arr.append('https://doi.org/10.1021/acs.jcim.0c00741') -#arr.append('https://doi.org/10.1021/acs.accounts.1c00440') - -#arr.append('https://doi.org/10.1021/ci700007b') -#arr.append('https://doi.org/10.1021/acs.jcim.5b00292') -#url = sys.argv[1] -#arr.append[url] - -V,E = process_main(arr,1) - -for vortex in V: - #print(vortex, "\n") - print(vortex.doi_url, "\n") -print("\n") -for i in range(len(E)): - #print(edge,"\n") - print(E[i][0].doi_url, ", ",E[i][1].doi_url, "\n") \ No newline at end of file diff --git a/verarbeitung/Processing_test_doi.py b/verarbeitung/Processing_test_doi.py deleted file mode 100644 index 898ab1a0265a1d7e1ef8f51a99cb4be1bc33ad87..0000000000000000000000000000000000000000 --- a/verarbeitung/Processing_test_doi.py +++ /dev/null @@ -1,104 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Created on Wed Nov 3 16:54:43 2021 - -@author: Malte Schokolowski -""" - -from bs4 import BeautifulSoup as bs -import requests as req -import sys -from pathlib import Path -import input_test as inp - - - -def process_main(array, depth): - #ERROR-Handling doi_array = NULL, Tiefe < 0 oder 1 ??? + - - if (depth < 0): - print("Error, depth of search must be positive") - - - - # leeres Array für die Knoten wird erstellt - # leeres Array für die Kanten wird erstellt - global V, E - V = [] - E = [] - - # Füge in Knoten-Array alle Starterknoten ein - for pub in array: - V.append(pub) - #print("\n") - process_rec(array, 0, depth) - return(V,E) - - -def process_rec(array, depth, depth_max): - depth += 1 - for pub_doi in array: - # Input aufrufen und speichern - #print(pub_doi) - pub = inp.input(pub_doi) - #for cit in pub._citations: - #print(pub.doi_url, cit.doi_url) - # Klasseninstanz bestehend aus u.a. - # Name, Autoren, DOI, Jahr, - # was_wir_zitiert_haben, wo_wir_zitiert_wurden - - for citation in pub._citations: - #print(pub.doi_url, ".\t", citation.doi_url, "\n") - #Knoten j erstellen, wenn noch unbekannt - if (citation.doi_url not in V): - if (depth <= depth_max): - V.append(citation.doi_url) - #print(citation.doi_url, "\n") - E.append([pub.doi_url,citation.doi_url]) - #print(pub.doi_url, ".\t", citation.doi_url, "\n") - - else: - E.append([pub.doi_url,citation.doi_url]) # Kante erstellen, wenn citation bekannt, also wenn beide in gleicher Tiefe sind oder bei Zyklus - #print(pub.doi_url, ".\t", citation.doi_url, "\n") - - - #for k in wo_wir_zitiert_wurden: - #if (i != k): - #Knoten k erstellen, wenn noch unbekannt - #Kante erstellen von k nach i - - if (depth < depth_max): - cit_arr = [] - for citation in pub._citations: - if ("acs" in citation.doi_url): - cit_arr.append(citation.doi_url) - process_rec(cit_arr, depth, depth_max) - #else: - #print("--- %s seconds ---" % (time.time() - start_time)) - #process_rec(wo_wir_zitiert_wurden, depth -1)''' - - - # Knoten- und Kantenmenge zurückgeben - # {1,2,3,4,5} oder - # {{1="paper1",0}, {2 = "paper2"},1} oder - # {1="paper1", 2 = "paper2"} - # {(1,2),(2,3),(2,4)} - - -arr = [] -arr.append('https://pubs.acs.org/doi/10.1021/acs.jcim.9b00249') -arr.append('https://doi.org/10.1021/acs.jmedchem.0c01332') -#arr.append('https://doi.org/10.1021/acs.jcim.0c00741') - -#arr.append('https://doi.org/10.1021/ci700007b') -#arr.append('https://doi.org/10.1021/acs.jcim.5b00292') -#url = sys.argv[1] -#arr.append[url] - -V,E = process_main(arr,2) - -for vortex in V: - print(vortex, "\n") -print("\n") -for edge in E: - print(edge,"\n") \ No newline at end of file diff --git a/verarbeitung/input_fj.py b/verarbeitung/input_fj.py deleted file mode 100644 index 00bb0126e2ae1abf6563bf99a16cc585b6d88077..0000000000000000000000000000000000000000 --- a/verarbeitung/input_fj.py +++ /dev/null @@ -1,144 +0,0 @@ -#!/usr/bin/env python3 -""" -Functions for information retrieval of articles from the ACS journal JCIM - -""" - -__author__ = "Florian Jochens" -__email__ = "fj@andaco.de" -__status__ = "Production" -#__copyright__ = "" -#__credits__ = ["", "", "", ""] -#__license__ = "" -#__version__ = "" -#__maintainer__ = "" - -from bs4 import BeautifulSoup as bs -import requests as req -import sys -from pathlib import Path - -class Publication: - #_registry = [] - _citations = [] - - def __init__(self, title, publication_date, contributors, doi_url, - subjects, num_citations): - #self._registry.append(self) - self.title = title - self.publication_date = publication_date - self.contributors = contributors - self.doi_url = doi_url - self.subjects = subjects - self.num_citations = num_citations - -class Citation: - def __init__(self, title, journal, contributors, doi_url): - self.title = title - self.journal = journal - self.contributors = contributors - self.doi_url = doi_url - -def get_article_info(soup): - header = soup.find('div', class_ = 'article_header-left pull-left') - article_title = header.find('span', class_ = 'hlFld-Title').text - publication_date = header.find('span', class_ = 'pub-date-value').text - for link in header.find('div', class_ = 'article_header-doiurl'): - doi_url = link.get('href') - subs = header.find('div', class_ = 'article_header-taxonomy') - subjects = [] - for sub in subs.find_all('a'): - subjects.append(sub.get('title')) - cons = header.find('ul', class_ = 'loa') - contributors = [] - for con in cons.find_all('span', class_ = 'hlFld-ContribAuthor'): - contributors.append(con.text) - numc = header.find('div', class_ = 'articleMetrics_count') - if not numc.a: - num_citations = 0 - else: - num_citations = numc.a.text - - pub = Publication(article_title, publication_date, contributors, doi_url, - subjects, num_citations) - return pub - -def get_download_url(): - export = soup.find('div', class_ = 'cit-download-dropdown_content') - url = 'https://pubs.acs.org' - for link in export.find_all('a'): - if link.get('title') == 'Citation and references': - url += link.get('href') - print(url) - return url - -def download(url): # Download citation and references file - if url.find('='): - filename = url.rsplit('=', 1)[1] - path = Path(('./files/' + filename)) - if path.is_file(): - print("File already exists") - else: - print("File does not exist") - -def get_citation_info(pub, num_citations, soup): - pub._citations = [] - details = soup.find('ol', class_ = 'cited-content_cbyCitation') - titles = [] - for title in details.find_all('span', - class_ = 'cited-content_cbyCitation_article-title'): - titles.append(title.text.replace('.', '')) - journal_names = [] - for name in details.find_all('span', - class_ = 'cited-content_cbyCitation_journal-name'): - journal_names.append(name.text) - doi_urls = [] - for url in details.find_all('a'): - doi_urls.append(url.get('href')) - contributors = [] - for contrib in details.find_all('span', - class_ = 'cited-content_cbyCitation_article-contributors'): - contributors.append(contrib.text) - for i in range(0, int(num_citations)): - pub._citations.append(Citation(titles[i], journal_names[i], - contributors[i], doi_urls[i])) -def print_pub_info(pub): - print(f'''Article title: {pub.title} -Publication date: {pub.publication_date} -DOI-URL: {pub.doi_url} - -Subjects:''') - print(*(pub.subjects), sep = ", ") - print('\nContributors:') - print(*(pub.contributors), sep = ", ") - - if int(pub.num_citations) > 0: - if int(pub.num_citations) == 1: - print(f'\nThis publication is cited by the following publication:\n') - else: - print(f'\nThis publication is cited by the following {pub.num_citations} publications:\n') - for citation in pub._citations: - print(f''' - Title: {citation.title} - Journal: {citation.journal} - Contributors: {citation.contributors} - DOI-URL: {citation.doi_url} - ''') - else: - print('\nThis publication is not cited by any other publication.') - -def input(url): - html_text = req.get(url).text - soup = bs(html_text, 'html.parser') - - pub = get_article_info(soup) - if int(pub.num_citations) > 0: - get_citation_info(pub, int(pub.num_citations), soup) - return pub - -#if len(sys.argv) != 2: -# sys.stderr.write('Usage: {} <url>\n'.format(sys.argv[0])) -# exit(1) -#url = sys.argv[1] -#pub = input(url) -#print_pub_info(pub) diff --git a/verarbeitung/json_text.txt b/verarbeitung/json_text.txt deleted file mode 100644 index b5e7fa9b225b911bcd2d80c5d74097bdf4d40e1c..0000000000000000000000000000000000000000 --- a/verarbeitung/json_text.txt +++ /dev/null @@ -1 +0,0 @@ -{"nodes": [{"name": "Comparing Molecular Patterns Using the Example of SMARTS: Applications and Filter Collection Analysis", "doi": "https://doi.org/10.1021/acs.jcim.9b00249"}, {"name": "Combining Machine Learning and Computational Chemistry for Predictive Insights Into Chemical Systems ", "doi": "https://doi.org/10.1021/acs.chemrev.1c00107"}, {"name": "Disconnected Maximum Common Substructures under Constraints ", "doi": "https://doi.org/10.1021/acs.jcim.0c00741"}, {"name": "Evolution of Novartis\u2019 Small Molecule Screening Deck Design ", "doi": "https://doi.org/10.1021/acs.jmedchem.0c01332"}, {"name": "Comparing Molecular Patterns Using the Example of SMARTS: Theory and Algorithms ", "doi": "https://doi.org/10.1021/acs.jcim.9b00250"}, {"name": "Machine learning accelerates quantum mechanics predictions of molecular crystals ", "doi": "https://doi.org/10.1016/j.physrep.2021.08.002"}, {"name": "The Growing Importance of Chirality in 3D Chemical Space Exploration and Modern Drug Discovery Approaches for Hit-ID ", "doi": "https://doi.org/10.1021/acsmedchemlett.1c00251"}, {"name": "Target-Based Evaluation of \u201cDrug-Like\u201d Properties and Ligand Efficiencies ", "doi": "https://doi.org/10.1021/acs.jmedchem.1c00416"}, {"name": "BonMOLi\u00e8re: Small-Sized Libraries of Readily Purchasable Compounds, Optimized to Produce Genuine Hits in Biological Screens across the Protein Space ", "doi": "https://doi.org/10.3390/ijms22157773"}, {"name": "Accelerating high-throughput virtual screening through molecular pool-based active learning ", "doi": "https://doi.org/10.1039/D0SC06805E"}, {"name": "Compound Screening ", "doi": "https://doi.org/10.1016/B978-0-12-820472-6.00078-5"}], "links": [{"source": "https://doi.org/10.1021/acs.jcim.9b00249", "target": "https://doi.org/10.1021/acs.chemrev.1c00107"}, {"source": "https://doi.org/10.1021/acs.jcim.9b00249", "target": "https://doi.org/10.1021/acs.jcim.0c00741"}, {"source": "https://doi.org/10.1021/acs.jcim.9b00249", "target": "https://doi.org/10.1021/acs.jmedchem.0c01332"}, {"source": "https://doi.org/10.1021/acs.jcim.9b00249", "target": "https://doi.org/10.1021/acs.jcim.9b00250"}, {"source": "https://doi.org/10.1021/acs.jcim.9b00249", "target": "https://doi.org/10.1016/j.physrep.2021.08.002"}, {"source": "https://doi.org/10.1021/acs.jmedchem.0c01332", "target": "https://doi.org/10.1021/acsmedchemlett.1c00251"}, {"source": "https://doi.org/10.1021/acs.jmedchem.0c01332", "target": "https://doi.org/10.1021/acs.jmedchem.1c00416"}, {"source": "https://doi.org/10.1021/acs.jmedchem.0c01332", "target": "https://doi.org/10.3390/ijms22157773"}, {"source": "https://doi.org/10.1021/acs.jmedchem.0c01332", "target": "https://doi.org/10.1039/D0SC06805E"}, {"source": "https://doi.org/10.1021/acs.jmedchem.0c01332", "target": "https://doi.org/10.1016/B978-0-12-820472-6.00078-5"}]} \ No newline at end of file diff --git a/verarbeitung/json_text_json.txt b/verarbeitung/json_text_json.txt deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/verarbeitung/processing_pseudo.py b/verarbeitung/processing_pseudo.py deleted file mode 100644 index 0d5f5da87e9402fbbdd2dc0294a47363b4aca3ca..0000000000000000000000000000000000000000 --- a/verarbeitung/processing_pseudo.py +++ /dev/null @@ -1,38 +0,0 @@ -# -*- coding: utf-8 -*- -""" -@author: Malte Schokolowski -""" - -def process_main(doi_array, depth): - #ERROR-Handling doi_array = NULL, Tiefe < 0 oder 1 ??? - - # leeres Array für die Knoten wird erstellt - # leeres Array für die Kanten wird erstellt - - # Füge in Knoten-Array alle Starterknoten ein - process_rec(doi_array, depth) - -def process_rec(doi_array, depth): - for i in range(len(doi_array)): - # Input aufrufen und speichern - # Klasseninstanz bestehend aus u.a. - # Name, Autoren, DOI, Jahr, - # was_wir_zitiert_haben, wo_wir_zitiert_wurden - for j in range(len(was_wir_zitiert_haben)): - #Knoten j erstellen, wenn noch unbekannt - #Kante erstellen von i nach j - for k in range(len(wo_wir_zitiert_wurden)): - if (i != k): - #Knoten k erstellen, wenn noch unbekannt - #Kante erstellen von k nach i - process_rec(was_wir_zitiert_haben, depth-1) - process_rec(wo_wir_zitiert_wurden, depth -1) - - - # Knoten- und Kantenmenge zurückgeben - # {1,2,3,4,5} oder - # {{1="paper1",0}, {2 = "paper2"},1} oder - # {1="paper1", 2 = "paper2"} - # {(1,2),(2,3),(2,4)} - - \ No newline at end of file