diff --git a/verarbeitung/Kanten_Vergleich.py b/verarbeitung/Kanten_Vergleich.py
new file mode 100644
index 0000000000000000000000000000000000000000..5e1cd41747f08a1cda10d9f735956809120b5139
--- /dev/null
+++ b/verarbeitung/Kanten_Vergleich.py
@@ -0,0 +1,29 @@
+#!/usr/bin/env python3
+
+def back_to_valid_edges(Kanten_aus_Json, Geloechte_Knoten):
+    '''
+    :param Kanten_aus_Json:  list of edges from the old graph
+    :type Kanten_aus_Json:   list
+    :param Geloechte_Knoten: list of deleted nodes from the old graph
+    :type Geloechte_Knoten:  list
+
+    function that deletes an edge if one or both of its nodes are deleted nodes
+    '''
+    list_of_edges_from_json = Kanten_aus_Json
+    list_of_valid_edges = list_of_edges_from_json.copy() #copy, so edges can be removed while iterating over the original list
+    list_of_deleted_nodes = Geloechte_Knoten
+
+    for deleted_node in list_of_deleted_nodes: #iterates over all deleted nodes
+        for edge in list_of_edges_from_json: #iterates over all edges of the old graph
+            for node in edge: #checks for both nodes of the edge whether one of them was deleted
+                if (node == deleted_node) and (edge in list_of_valid_edges): #if one of them is a deleted node and the edge has not been removed yet
+                    list_of_valid_edges.remove(edge) #removes the edge
+                    break #the edge is already removed, the second node does not need to be checked
+
+    return(list_of_valid_edges)
+
+#Kanten_Menge_Ganz = [["doi_1","doi_2"],["doi_3","doi_4"],["doi_5","doi_6"]]
+#Geloeschte = ["doi_2","doi_1","doi_4"]
+#print(back_to_valid_edges(Kanten_Menge_Ganz,Geloeschte))
+
+#Afterwards, Processing has to be called with the added nodes
diff --git a/verarbeitung/Knoten_Vergleich.py b/verarbeitung/Knoten_Vergleich.py
new file mode 100644
index 0000000000000000000000000000000000000000..37fc1671ebeae942e088508f88dabf30087d5cf5
--- /dev/null
+++ b/verarbeitung/Knoten_Vergleich.py
@@ -0,0 +1,36 @@
+#!/usr/bin/env python3
+from collections import Counter
+
+def doi_listen_vergleichen(alte,neue):
+    '''
+    :param alte: list of dois from the old graph
+    :type alte:  list
+    :param neue: list of dois from the new graph
+    :type neue:  list
+
+    function to calculate which nodes of the old graph were deleted and which were added
+    '''
+    dois_from_old_graph = alte #IMPORTANT: no duplicate DOIs within one list
+    dois_from_new_graph = neue
+    deleted_nodes = []
+    common_nodes = []
+    inserted_nodes = []
+    all_dois = dois_from_old_graph + dois_from_new_graph
+
+    for doi in all_dois: #iterates over the merged list of new and old dois
+        if (Counter(all_dois)[doi] == 2) and (doi not in common_nodes): #if the doi occurs twice, the node is in the old and in the new graph
+            common_nodes.append(doi) #appends the doi to the common ones, if it is not already in it
+        elif (doi in dois_from_old_graph) and (doi not in dois_from_new_graph): #if the doi occurs once and comes from the old graph, it is a deleted node
+            deleted_nodes.append(doi) #appends the doi to the deleted ones
+        elif (doi in dois_from_new_graph) and (doi not in dois_from_old_graph): #if the doi occurs once and comes from the new graph, it is an inserted node
+            inserted_nodes.append(doi) #appends the doi to the inserted ones
+    return(common_nodes, inserted_nodes, deleted_nodes)
+
+
+#Test prints
+#liste_1 = ["doi_1","doi_2","doi_3","doi_4","doi_5"]
+#liste_2 = ["doi_1","doi_2","doi_3","doi_6","doi_7"]
+#print("common elements: ",doi_listen_vergleichen(liste_1,liste_2)[0])
+#print("added elements: ",doi_listen_vergleichen(liste_1,liste_2)[1])
+#print("deleted elements: ",doi_listen_vergleichen(liste_1,liste_2)[2])
+
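+# Expected output of the commented test prints above (a sketch for orientation only,
+# derived by hand from the return order (common_nodes, inserted_nodes, deleted_nodes)):
+#   common elements:  ['doi_1', 'doi_2', 'doi_3']
+#   added elements:   ['doi_6', 'doi_7']
+#   deleted elements: ['doi_4', 'doi_5']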
diff --git a/verarbeitung/import_from_json.py b/verarbeitung/import_from_json.py
new file mode 100644
index 0000000000000000000000000000000000000000..c318e601977cd6f7b1b8e8550024db6b0c35f616
--- /dev/null
+++ b/verarbeitung/import_from_json.py
@@ -0,0 +1,37 @@
+#!/usr/bin/env python3
+import json
+
+class Publication:
+    def __init__(self, doi_url, title, contributors, journal, publication_date, group):
+        self.doi_url = doi_url
+        self.title = title
+        self.contributors = contributors
+        self.journal = journal
+        self.publication_date = publication_date
+        self.group = group
+
+def input_from_json(json_file):
+    '''
+    :param json_file: JSON file of the old graph
+    :type json_file:  JSON file
+    '''
+    with open(json_file,'r') as file: #opens the json file with reading permission
+        python_dict2 = json.load(file) #saves the information in a dictionary
+
+    list_of_nodes_with_all_info = python_dict2["nodes"]
+    list_of_edges_in_json_format = python_dict2["links"]
+    list_of_node_objects = []
+    list_of_edges = []
+
+    for node in list_of_nodes_with_all_info: #iterates over the list of nodes
+        pub = Publication(node["doi"],node["name"],node["author"],node["journal"],node["year"], node["group"]) #creates a Publication object for each node
+        list_of_node_objects.append(pub) #appends the objects to a list
+
+# TODO: the list of references and citations is still missing for each object
+
+# Iterates over the list of edge dictionaries and stores each edge as a list
+    for edge in list_of_edges_in_json_format: #iterates over the list of edges
+        new_list = [edge["source"],edge["target"]] #converts the edges to another representation
+        list_of_edges.append(new_list) #appends the edges to a list
+
+    return(list_of_node_objects, list_of_edges)
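+
+# Minimal usage sketch (commented out). The file name "example_old_graph.json" is only
+# an assumption; any JSON file with "nodes" and "links" entries in the structure read
+# above will work.
+#nodes, edges = input_from_json("example_old_graph.json")
+#print([pub.doi_url for pub in nodes]) #DOIs of all imported publications
+#print(edges) #edges as [source, target] lists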
diff --git a/verarbeitung/input_fj.py b/verarbeitung/input_fj.py
new file mode 100644
index 0000000000000000000000000000000000000000..ecc8e68fc5a84a446ae3f09dcb5ed56e8d262766
--- /dev/null
+++ b/verarbeitung/input_fj.py
@@ -0,0 +1,154 @@
+#!/usr/bin/env python3
+"""
+Functions for information retrieval of articles from the ACS journal JCIM
+
+"""
+
+__author__ = "Florian Jochens"
+__email__ = "fj@andaco.de"
+__status__ = "Production"
+#__copyright__ = ""
+#__credits__ = ["", "", "", ""]
+#__license__ = ""
+#__version__ = ""
+#__maintainer__ = ""
+
+from bs4 import BeautifulSoup as bs
+import requests as req
+import sys
+from pathlib import Path
+
+class Publication:
+    #_registry = []
+    _citations = []
+    _references = []
+
+    def __init__(self, title, publication_date, contributors, doi_url,
+                 subjects = None, num_citations = None):
+        #self._registry.append(self)
+        self.title = title
+        self.publication_date = publication_date
+        self.contributors = contributors
+        self.doi_url = doi_url
+        self.subjects = subjects
+        self.num_citations = num_citations
+        #self._citations = []
+        #self._references = []
+
+class Citation:
+    def __init__(self, title, journal, contributors, doi_url):
+        self.title = title
+        self.journal = journal
+        self.contributors = contributors
+        self.doi_url = doi_url
+
+class References:
+    def __init__(self, title, journal, contributors, doi_url):
+        self.title = title
+        self.journal = journal
+        self.contributors = contributors
+        self.doi_url = doi_url
+
+def get_article_info(soup):
+    header = soup.find('div', class_ = 'article_header-left pull-left')
+    article_title = header.find('span', class_ = 'hlFld-Title').text
+    publication_date = header.find('span', class_ = 'pub-date-value').text
+    for link in header.find('div', class_ = 'article_header-doiurl'):
+        doi_url = link.get('href')
+    subs = header.find('div', class_ = 'article_header-taxonomy')
+    subjects = []
+    for sub in subs.find_all('a'):
+        subjects.append(sub.get('title'))
+    cons = header.find('ul', class_ = 'loa')
+    contributors = []
+    for con in cons.find_all('span', class_ = 'hlFld-ContribAuthor'):
+        contributors.append(con.text)
+    numc = header.find('div', class_ = 'articleMetrics_count')
+    if not numc.a:
+        num_citations = 0
+    else:
+        num_citations = numc.a.text
+
+    pub = Publication(article_title, publication_date, contributors, doi_url,
+                      subjects, num_citations)
+    return pub
+
+def get_download_url(soup): # soup passed as parameter; the function previously relied on an undefined global
+    export = soup.find('div', class_ = 'cit-download-dropdown_content')
+    url = 'https://pubs.acs.org'
+    for link in export.find_all('a'):
+        if link.get('title') == 'Citation and references':
+            url += link.get('href')
+    print(url)
+    return url
+
+def download(url): # Download citation and references file
+    if url.find('=') != -1: # str.find returns -1 (truthy) if '=' is not contained, so compare explicitly
+        filename = url.rsplit('=', 1)[1]
+        path = Path('./files/' + filename)
+        if path.is_file():
+            print("File already exists")
+        else:
+            print("File does not exist")
+
+def get_citation_info(pub, num_citations, soup):
+    pub._citations = []
+    details = soup.find('ol', class_ = 'cited-content_cbyCitation')
+    titles = []
+    for title in details.find_all('span',
+            class_ = 'cited-content_cbyCitation_article-title'):
+        titles.append(title.text.replace('.', ''))
+    journal_names = []
+    for name in details.find_all('span',
+            class_ = 'cited-content_cbyCitation_journal-name'):
+        journal_names.append(name.text)
+    doi_urls = []
+    for url in details.find_all('a'):
+        doi_urls.append(url.get('href'))
+    contributors = []
+    for contrib in details.find_all('span',
+            class_ = 'cited-content_cbyCitation_article-contributors'):
+        contributors.append(contrib.text)
+    for i in range(0, int(num_citations)):
+        pub._citations.append(Citation(titles[i], journal_names[i],
+                                       contributors[i], doi_urls[i]))
+def print_pub_info(pub):
+    print(f'''Article title: {pub.title}
+Publication date: {pub.publication_date}
+DOI-URL: {pub.doi_url}
+
+Subjects:''')
+    print(*(pub.subjects), sep = ", ")
+    print('\nContributors:')
+    print(*(pub.contributors), sep = ", ")
+
+    if int(pub.num_citations) > 0:
+        if int(pub.num_citations) == 1:
+            print('\nThis publication is cited by the following publication:\n')
+        else:
+            print(f'\nThis publication is cited by the following {pub.num_citations} publications:\n')
+        for citation in pub._citations:
+            print(f'''
+    Title: {citation.title}
+    Journal: {citation.journal}
+    Contributors: {citation.contributors}
+    DOI-URL: {citation.doi_url}
+            ''')
+    else:
+        print('\nThis publication is not cited by any other publication.')
+
+def input(url): # note: shadows the built-in input()
+    html_text = req.get(url).text
+    soup = bs(html_text, 'html.parser')
+
+    pub = get_article_info(soup)
+    if int(pub.num_citations) > 0:
+        get_citation_info(pub, int(pub.num_citations), soup)
+    return pub
+
+#if len(sys.argv) != 2:
+#    sys.stderr.write('Usage: {} <url>\n'.format(sys.argv[0]))
+#    exit(1)
+#url = sys.argv[1]
+#pub = input(url)
+#print_pub_info(pub)
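+
+# Alternative usage sketch (commented out): call input() directly with the DOI URL of a
+# JCIM article instead of reading it from the command line. The URL below is only a
+# placeholder, not a real article.
+#pub = input('https://pubs.acs.org/doi/10.1021/acs.jcim.XXXXXXX')
+#print_pub_info(pub)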