Skip to content
Snippets Groups Projects
Commit 7b8730bc authored by Große, Judith's avatar Große, Judith Committed by Ockenden, Samuel
Browse files

Main

parent b227a3d8
No related tags found
1 merge request!5Main
#!/usr/bin/env python3
"""
Functions for information retrieval of articles from the ACS journal JCIM
"""
__author__ = "Florian Jochens"
__email__ = "fj@andaco.de"
__status__ = "Production"
#__copyright__ = ""
#__credits__ = ["", "", "", ""]
#__license__ = ""
#__version__ = ""
#__maintainer__ = ""
from bs4 import BeautifulSoup as bs
import requests as req
import sys
from pathlib import Path
class Publication:
    """Metadata container for a single journal article.

    Instances are built by get_article_info(); the _citations list is
    filled later by get_citation_info() once the "cited by" section has
    been scraped.
    """

    def __init__(self, title, publication_date, contributors, doi_url,
                 subjects, num_citations):
        self.title = title
        self.publication_date = publication_date
        self.contributors = contributors
        self.doi_url = doi_url
        self.subjects = subjects
        # May be a string scraped from the page; callers convert via int().
        self.num_citations = num_citations
        # Bug fix: _citations used to be a *class* attribute, so every
        # Publication instance aliased one shared list. It is now a
        # per-instance list.
        self._citations = []
class Citation:
    """Lightweight record describing one publication that cites an article."""

    def __init__(self, title, journal, contributors, doi_url):
        # Store the scraped fields verbatim.
        self.title, self.journal = title, journal
        self.contributors, self.doi_url = contributors, doi_url
def get_article_info(soup):
    """Extract title, date, DOI, subjects, contributors and citation count
    from a parsed ACS article page and return them as a Publication.

    soup: BeautifulSoup tree of the article HTML page.
    """
    header = soup.find('div', class_='article_header-left pull-left')
    article_title = header.find('span', class_='hlFld-Title').text
    publication_date = header.find('span', class_='pub-date-value').text
    # The DOI link is the (last) anchor inside the doi-url container.
    for anchor in header.find('div', class_='article_header-doiurl'):
        doi_url = anchor.get('href')
    taxonomy = header.find('div', class_='article_header-taxonomy')
    subjects = [tag.get('title') for tag in taxonomy.find_all('a')]
    author_list = header.find('ul', class_='loa')
    contributors = [span.text for span in
                    author_list.find_all('span', class_='hlFld-ContribAuthor')]
    # The metrics box only contains an <a> when the citation count is > 0.
    metrics = header.find('div', class_='articleMetrics_count')
    num_citations = metrics.a.text if metrics.a else 0
    return Publication(article_title, publication_date, contributors, doi_url,
                       subjects, num_citations)
def get_download_url(soup):
    """Return the absolute URL of the 'Citation and references' export link.

    soup: BeautifulSoup tree of the article HTML page.

    Bug fix: the original took no parameter and referenced an undefined
    global ``soup``, so every call raised NameError. The parsed page is
    now passed in explicitly, matching get_article_info().
    """
    export = soup.find('div', class_='cit-download-dropdown_content')
    url = 'https://pubs.acs.org'
    # The export menu holds several links; pick the citation/references one.
    for link in export.find_all('a'):
        if link.get('title') == 'Citation and references':
            url += link.get('href')
    print(url)
    return url
def download(url):  # Download citation and references file
    """Check whether the export file named in *url* already exists in ./files/.

    The filename is taken from the part after the last '=' in the URL
    (ACS export links end in ...=<filename>). Only prints the result;
    returns None.

    Bug fix: the original used ``if url.find('='):`` — truthy for -1
    (no '=' in the URL) and falsy when '=' is at index 0 — so a URL
    without '=' crashed with IndexError in rsplit()[1].
    """
    if '=' in url:
        filename = url.rsplit('=', 1)[1]
        path = Path('./files/' + filename)
        if path.is_file():
            print("File already exists")
        else:
            print("File does not exist")
def get_citation_info(pub, num_citations, soup):
    """Scrape the 'cited by' listing and attach Citation objects to
    pub._citations (resetting any previous contents).

    pub:           Publication to populate.
    num_citations: number of citing publications to collect.
    soup:          BeautifulSoup tree of the article HTML page.
    """
    pub._citations = []
    listing = soup.find('ol', class_='cited-content_cbyCitation')
    # Collect the parallel field lists; trailing dots are stripped from titles.
    titles = [t.text.replace('.', '') for t in listing.find_all(
        'span', class_='cited-content_cbyCitation_article-title')]
    journals = [j.text for j in listing.find_all(
        'span', class_='cited-content_cbyCitation_journal-name')]
    urls = [a.get('href') for a in listing.find_all('a')]
    authors = [c.text for c in listing.find_all(
        'span', class_='cited-content_cbyCitation_article-contributors')]
    # Indexed loop (not zip) deliberately kept: it raises if the page
    # yields fewer entries than num_citations instead of silently truncating.
    for idx in range(int(num_citations)):
        pub._citations.append(
            Citation(titles[idx], journals[idx], authors[idx], urls[idx]))
def print_pub_info(pub):
    """Pretty-print a Publication and its citations to stdout."""
    print(f'''Article title: {pub.title}
Publication date: {pub.publication_date}
DOI-URL: {pub.doi_url}
Subjects:''')
    print(*(pub.subjects), sep=", ")
    print('\nContributors:')
    print(*(pub.contributors), sep=", ")
    citation_count = int(pub.num_citations)
    if citation_count > 0:
        # Singular/plural phrasing for the "cited by" header.
        if citation_count == 1:
            print(f'\nThis publication is cited by the following publication:\n')
        else:
            print(f'\nThis publication is cited by the following {pub.num_citations} publications:\n')
        for cited_by in pub._citations:
            print(f'''
Title: {cited_by.title}
Journal: {cited_by.journal}
Contributors: {cited_by.contributors}
DOI-URL: {cited_by.doi_url}
''')
    else:
        print('\nThis publication is not cited by any other publication.')
def input(url):
    """Fetch an ACS article page and return it as a populated Publication.

    Citations are scraped only when the article has at least one.

    NOTE(review): this function shadows the builtin input(); it is
    imported by name elsewhere (``from input_fj import input``), so the
    name is kept for compatibility.
    """
    page = bs(req.get(url).text, 'html.parser')
    pub = get_article_info(page)
    citation_count = int(pub.num_citations)
    if citation_count > 0:
        get_citation_info(pub, citation_count, page)
    return pub
#if len(sys.argv) != 2:
# sys.stderr.write('Usage: {} <url>\n'.format(sys.argv[0]))
# exit(1)
#url = sys.argv[1]
#pub = input(url)
#print_pub_info(pub)
# -*- coding: utf-8 -*-
"""
Created on Wed Nov 3 16:54:43 2021
@author: Malte Schokolowski
"""
from bs4 import BeautifulSoup as bs
import requests as req
import sys
from pathlib import Path
from input_fj import input
from json_demo import output_to_json
def process_main(doi_input_array, depth):
    """Build a citation graph from a list of DOI URLs.

    doi_input_array: list of article URLs to use as root nodes.
    depth:           maximum recursion depth for following citations.

    Populates the module-level ``nodes`` and ``edges`` lists (used by
    process_rec_depth), writes them to JSON via output_to_json and
    returns (nodes, edges).

    Bug fix: the original removed duplicates from doi_input_array while
    iterating over it, which skips the element following each removal.
    A filtered copy is built instead; the input list is left untouched.
    """
    # ERROR handling: empty input array
    if len(doi_input_array) == 0:
        print("Error, no input data")
    # ERROR: a negative number was given for the depth
    if depth < 0:
        print("Error, depth of search must be positive")

    # Fresh node and edge lists shared with process_rec_depth.
    global nodes, edges
    nodes = []
    edges = []

    # Every distinct publication from the input becomes a root node;
    # duplicates (same resolved doi_url) are dropped.
    unique_dois = []
    for pub_doi in doi_input_array:
        pub = input(pub_doi)
        if any(pub.doi_url == node.doi_url for node in nodes):
            continue
        nodes.append(pub)
        unique_dois.append(pub_doi)

    process_rec_depth(unique_dois, 0, depth)

    output_to_json(nodes, edges)
    return (nodes, edges)
def process_rec_depth(array, depth, depth_max):
    """Recursively add citation nodes and edges up to depth_max.

    array:     list of DOI URLs to process at this level.
    depth:     current recursion depth (incremented on entry).
    depth_max: maximum depth to descend to.

    Relies on the module-level ``nodes`` and ``edges`` lists created by
    process_main().
    """
    # The depth is increased by 1 on every recursive call.
    depth += 1
    # A class object is created for every publication in the input array.
    for pub_doi in array:
        pub = input(pub_doi)
        # For every citation stored in the publication's class instance,
        # check whether it already exists as a node.
        for citation in pub._citations:
            # If the citation does not yet exist in the node array AND the
            # maximum depth has not been reached, it is stored as a node.
            # Additionally the connection to the publication is stored as a
            # pair in the edge array.
            not_in_nodes = True
            for node in nodes:
                if (citation.doi_url == node.doi_url):
                    not_in_nodes = False
                    break
            if (not_in_nodes):
                if (depth <= depth_max):
                    nodes.append(citation)
                    edges.append([pub.doi_url,citation.doi_url])
            # If the citation already exists in the node array, only the
            # connection to the publication is stored as a pair in the
            # edge array.
            else:
                edges.append([pub.doi_url,citation.doi_url])
        # If the maximum depth has not been reached yet, all citations of
        # this publication are collected into an array and the function is
        # called again with it.
        if (depth < depth_max):
            cit_arr = []
            for citation in pub._citations:
                # Currently only citations with "acs" in the URL are kept,
                # since we cannot extract the information from other sources.
                if ("acs" in citation.doi_url):
                    cit_arr.append(citation.doi_url)
            # Recursive call of the function.
            process_rec_depth(cit_arr, depth, depth_max)
# Programmtest, weil noch keine Verbindung zum Input besteht.
arr = []
arr.append('https://pubs.acs.org/doi/10.1021/acs.jcim.9b00249')
arr.append('https://pubs.acs.org/doi/10.1021/acs.jcim.9b00249')
arr.append('https://doi.org/10.1021/acs.jmedchem.0c01332')
#arr.append('https://doi.org/10.1021/acs.jcim.0c00741')
#arr.append('https://doi.org/10.1021/ci700007b')
#arr.append('https://doi.org/10.1021/acs.jcim.5b00292')
#url = sys.argv[1]
#arr.append[url]
nodes,edges = process_main(arr,1)
print("Knoten:\n")
for node in nodes:
print(node.title, "\n")
print("\nKanten:\n")
for edge in edges:
print(edge,"\n")
\ No newline at end of file
File added
File added
#!/usr/bin/env python3
import json
from input_fj import input
def output_to_json(V, E):
    """Serialize the citation graph to the file 'json_text.txt'.

    V: iterable of node objects exposing title, contributors,
       publication_date and doi_url.
    E: iterable of [source_doi, target_doi] pairs.

    The written structure ({"nodes": [...], "links": [...]}) matches the
    usual d3 force-graph input format.
    """
    graph = {
        "nodes": [
            {
                "name": node.title,
                "author": node.contributors,
                "year": node.publication_date,
                "doi": node.doi_url,
            }
            for node in V
        ],
        "links": [{"source": edge[0], "target": edge[1]} for edge in E],
    }
    with open('json_text.txt', 'w') as outfile:
        json.dump(graph, outfile)
#knoten = ["doi1", "doi2", "doi3"]
#kanten = [[1,2],[3,4],[5,6]]
#output_to_json(knoten,kanten)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment