Commit 66157006 authored by Große, Judith, committed by Malte Schokolowski

added import from json

parent f6798fdc
Part of merge request !11: merge verarbeitung to main repo
#!/usr/bin/env python3

def back_to_valid_edges(Kanten_aus_Json, Geloechte_Knoten):
    '''
    :param Kanten_aus_Json: list of edges from the old graph
    :type Kanten_aus_Json: list

    :param Geloechte_Knoten: list of deleted nodes from the old graph
    :type Geloechte_Knoten: list

    function that deletes an edge if one or both of its nodes are deleted nodes
    '''
    list_of_edges_from_json = Kanten_aus_Json
    list_of_valid_edges = list(list_of_edges_from_json)     # work on a copy so the list we iterate over is not modified
    list_of_deleted_nodes = Geloechte_Knoten
    for deleted_node in list_of_deleted_nodes:               # iterates over all deleted nodes
        for edge in list_of_edges_from_json:                 # iterates over all edges from the old graph
            for node in edge:                                # checks both nodes of the edge
                if node == deleted_node:                     # if one of them is a deleted node
                    if edge in list_of_valid_edges:          # the edge may already have been removed for its other node
                        list_of_valid_edges.remove(edge)     # removes the edge
                    break                                    # this edge is handled, continue with the next edge
    return(list_of_valid_edges)


#Kanten_Menge_Ganz = [["doi_1","doi_2"],["doi_3","doi_4"],["doi_5","doi_6"]]
#Geloeschte = ["doi_2","doi_1","doi_4"]
#print(back_to_valid_edges(Kanten_Menge_Ganz,Geloeschte))

# Afterwards, Processing has to be called with the added nodes
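# Usage sketch: expected behaviour with the commented test data above (the DOI
# values are only placeholders). Every edge that contains at least one deleted
# node is removed, so only ["doi_5","doi_6"] survives:
#
#   Kanten_Menge_Ganz = [["doi_1","doi_2"],["doi_3","doi_4"],["doi_5","doi_6"]]
#   Geloeschte = ["doi_2","doi_1","doi_4"]
#   back_to_valid_edges(Kanten_Menge_Ganz, Geloeschte)   # -> [["doi_5","doi_6"]]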
#!/usr/bin/env python3

from collections import Counter

def doi_listen_vergleichen(alte, neue):
    '''
    :param alte: list of dois from the old graph
    :type alte: list

    :param neue: list of dois from the new graph
    :type neue: list

    function to calculate which nodes from the old graph were deleted and which nodes were added
    '''
    dois_from_old_graph = alte  # IMPORTANT: no duplicate DOIs
    dois_from_new_graph = neue
    deleted_nodes = []
    common_nodes = []
    inserted_nodes = []
    all_dois = dois_from_old_graph + dois_from_new_graph
    for doi in all_dois:  # iterates over the merged list of new and old dois
        if (Counter(all_dois)[doi] == 2) and (doi not in common_nodes):  # if the doi occurs twice, the node is in both the old and the new graph
            common_nodes.append(doi)  # appends the doi to the common ones if it is not already in the list
        elif (doi in dois_from_old_graph) and (doi not in dois_from_new_graph):  # if the doi occurs once and comes from the old graph, it is a deleted node
            deleted_nodes.append(doi)  # appends the doi to the deleted ones
        elif (doi in dois_from_new_graph) and (doi not in dois_from_old_graph):  # if the doi occurs once and comes from the new graph, it is an inserted node
            inserted_nodes.append(doi)  # appends the doi to the inserted ones
    return(common_nodes, inserted_nodes, deleted_nodes)


#Test Prints
#liste_1 = ["doi_1","doi_2","doi_3","doi_4","doi_5"]
#liste_2 = ["doi_1","doi_2","doi_3","doi_6","doi_7"]
#print("common elements: ", doi_listen_vergleichen(liste_1,liste_2)[0])
#print("added elements: ", doi_listen_vergleichen(liste_1,liste_2)[1])
#print("deleted elements: ", doi_listen_vergleichen(liste_1,liste_2)[2])
#!/usr/bin/env python3

import json

class Publication:
    def __init__(self, doi_url, title, contributors, journal, publication_date, group):
        self.doi_url = doi_url
        self.title = title
        self.contributors = contributors
        self.journal = journal
        self.publication_date = publication_date
        self.group = group


def input_from_json(json_file):
    '''
    :param json_file: JSON file for the old graph
    :type json_file: JSON file
    '''
    with open(json_file, 'r') as file:  # opens the json file with reading permission
        python_dict2 = json.load(file)  # saves the information in a dictionary

    list_of_nodes_with_all_info = python_dict2["nodes"]
    list_of_edges_in_json_format = python_dict2["links"]

    list_of_node_objects = []
    list_of_edges = []
    for node in list_of_nodes_with_all_info:  # iterates over the list of nodes
        pub = Publication(node["doi"], node["name"], node["author"], node["journal"], node["year"], node["group"])  # creates a Publication object for each node
        list_of_node_objects.append(pub)  # appends the objects to a list

    # Each object is still missing its list of references and citations
    # Iterates over the list of edge dictionaries and stores them as a list of pairs
    for edge in list_of_edges_in_json_format:  # iterates over the list of edges
        new_list = [edge["source"], edge["target"]]  # converts the edge to the [source, target] representation
        list_of_edges.append(new_list)  # appends the edge to the list
    return(list_of_node_objects, list_of_edges)
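# Sketch of the JSON layout that input_from_json() expects, inferred from the keys
# accessed above ("nodes"/"links" and the per-node fields); the file name and the
# values below are only illustrative:
#
#   {
#     "nodes": [{"doi": "doi_1", "name": "Some title", "author": ["A. Author"],
#                "journal": "Some journal", "year": "2021", "group": 0}],
#     "links": [{"source": "doi_1", "target": "doi_2"}]
#   }
#
#   nodes, edges = input_from_json("old_graph.json")
#   # nodes -> list of Publication objects, edges -> [["doi_1", "doi_2"], ...]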
#!/usr/bin/env python3

"""
Functions for information retrieval of articles from the ACS journal JCIM
"""

__author__ = "Florian Jochens"
__email__ = "fj@andaco.de"
__status__ = "Production"
#__copyright__ = ""
#__credits__ = ["", "", "", ""]
#__license__ = ""
#__version__ = ""
#__maintainer__ = ""

from bs4 import BeautifulSoup as bs
import requests as req
import sys
from pathlib import Path

class Publication:
    #_registry = []
    _citations = []
    _references = []
    def __init__(self, title, publication_date, contributors, doi_url,
                 subjects = None, num_citations = None):
        #self._registry.append(self)
        self.title = title
        self.publication_date = publication_date
        self.contributors = contributors
        self.doi_url = doi_url
        self.subjects = subjects
        self.num_citations = num_citations
        #self._citations = []
        #self._references = []
class Citation:
    def __init__(self, title, journal, contributors, doi_url):
        self.title = title
        self.journal = journal
        self.contributors = contributors
        self.doi_url = doi_url

class References:
    def __init__(self, title, journal, contributors, doi_url):
        self.title = title
        self.journal = journal
        self.contributors = contributors
        self.doi_url = doi_url
def get_article_info(soup):
    header = soup.find('div', class_ = 'article_header-left pull-left')
    article_title = header.find('span', class_ = 'hlFld-Title').text
    publication_date = header.find('span', class_ = 'pub-date-value').text
    for link in header.find('div', class_ = 'article_header-doiurl'):
        doi_url = link.get('href')
    subs = header.find('div', class_ = 'article_header-taxonomy')
    subjects = []
    for sub in subs.find_all('a'):
        subjects.append(sub.get('title'))
    cons = header.find('ul', class_ = 'loa')
    contributors = []
    for con in cons.find_all('span', class_ = 'hlFld-ContribAuthor'):
        contributors.append(con.text)
    numc = header.find('div', class_ = 'articleMetrics_count')
    if not numc.a:
        num_citations = 0
    else:
        num_citations = numc.a.text
    pub = Publication(article_title, publication_date, contributors, doi_url,
                      subjects, num_citations)
    return pub
def get_download_url(soup): # builds the citation/references export URL from the article page soup
    export = soup.find('div', class_ = 'cit-download-dropdown_content')
    url = 'https://pubs.acs.org'
    for link in export.find_all('a'):
        if link.get('title') == 'Citation and references':
            url += link.get('href')
    print(url)
    return url

def download(url): # Download citation and references file
    if url.find('=') != -1: # str.find returns -1 if there is no '=' in the url
        filename = url.rsplit('=', 1)[1]
        path = Path('./files/' + filename)
        if path.is_file():
            print("File already exists")
        else:
            print("File does not exist")
def get_citation_info(pub, num_citations, soup):
    pub._citations = []
    details = soup.find('ol', class_ = 'cited-content_cbyCitation')
    titles = []
    for title in details.find_all('span',
            class_ = 'cited-content_cbyCitation_article-title'):
        titles.append(title.text.replace('.', ''))
    journal_names = []
    for name in details.find_all('span',
            class_ = 'cited-content_cbyCitation_journal-name'):
        journal_names.append(name.text)
    doi_urls = []
    for url in details.find_all('a'):
        doi_urls.append(url.get('href'))
    contributors = []
    for contrib in details.find_all('span',
            class_ = 'cited-content_cbyCitation_article-contributors'):
        contributors.append(contrib.text)
    for i in range(0, int(num_citations)):
        pub._citations.append(Citation(titles[i], journal_names[i],
            contributors[i], doi_urls[i]))
def print_pub_info(pub):
    print(f'''Article title: {pub.title}
Publication date: {pub.publication_date}
DOI-URL: {pub.doi_url}
Subjects:''')
    print(*(pub.subjects), sep = ", ")
    print('\nContributors:')
    print(*(pub.contributors), sep = ", ")

    if int(pub.num_citations) > 0:
        if int(pub.num_citations) == 1:
            print('\nThis publication is cited by the following publication:\n')
        else:
            print(f'\nThis publication is cited by the following {pub.num_citations} publications:\n')
        for citation in pub._citations:
            print(f'''
Title: {citation.title}
Journal: {citation.journal}
Contributors: {citation.contributors}
DOI-URL: {citation.doi_url}
''')
    else:
        print('\nThis publication is not cited by any other publication.')
def input(url):
    html_text = req.get(url).text
    soup = bs(html_text, 'html.parser')
    pub = get_article_info(soup)
    if int(pub.num_citations) > 0:
        get_citation_info(pub, int(pub.num_citations), soup)
    return pub


#if len(sys.argv) != 2:
#    sys.stderr.write('Usage: {} <url>\n'.format(sys.argv[0]))
#    exit(1)
#
#url = sys.argv[1]
#pub = input(url)
#print_pub_info(pub)
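# Usage sketch: fetch one ACS article page and print its citation information
# (the URL is only a placeholder for a real pubs.acs.org article page):
#
#   pub = input('https://pubs.acs.org/doi/...')
#   print_pub_info(pub)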