diff --git a/input/input_fj.py b/input/input_fj.py
deleted file mode 100755
index 00bb0126e2ae1abf6563bf99a16cc585b6d88077..0000000000000000000000000000000000000000
--- a/input/input_fj.py
+++ /dev/null
@@ -1,144 +0,0 @@
-#!/usr/bin/env python3
-"""
-Functions for information retrieval of articles from the ACS journal JCIM
-
-"""
-
-__author__ = "Florian Jochens"
-__email__ = "fj@andaco.de"
-__status__ = "Production"
-#__copyright__ = ""
-#__credits__ = ["", "", "", ""]
-#__license__ = ""
-#__version__ = ""
-#__maintainer__ = ""
-
-from bs4 import BeautifulSoup as bs
-import requests as req
-import sys
-from pathlib import Path
-
-class Publication:
-    #_registry = []
-    _citations = []
-
-    def __init__(self, title, publication_date, contributors, doi_url,
-                 subjects, num_citations):
-        #self._registry.append(self)
-        self.title = title
-        self.publication_date = publication_date
-        self.contributors = contributors
-        self.doi_url = doi_url
-        self.subjects = subjects
-        self.num_citations = num_citations
-
-class Citation:
-    def __init__(self, title, journal, contributors, doi_url):
-        self.title = title
-        self.journal = journal
-        self.contributors = contributors
-        self.doi_url = doi_url
-
-def get_article_info(soup):
-    header = soup.find('div', class_ = 'article_header-left pull-left')
-    article_title = header.find('span', class_ = 'hlFld-Title').text
-    publication_date = header.find('span', class_ = 'pub-date-value').text
-    for link in header.find('div', class_ = 'article_header-doiurl'):
-        doi_url = link.get('href')
-    subs = header.find('div', class_ = 'article_header-taxonomy')
-    subjects = []
-    for sub in subs.find_all('a'):
-        subjects.append(sub.get('title'))
-    cons = header.find('ul', class_ = 'loa')
-    contributors = []
-    for con in cons.find_all('span', class_ = 'hlFld-ContribAuthor'):
-        contributors.append(con.text)
-    numc = header.find('div', class_ = 'articleMetrics_count')
-    if not numc.a:
-        num_citations = 0
-    else:
-        num_citations = numc.a.text
-
-    pub = Publication(article_title, publication_date, contributors, doi_url,
-                      subjects, num_citations)
-    return pub
-
-def get_download_url():
-    export = soup.find('div', class_ = 'cit-download-dropdown_content')
-    url = 'https://pubs.acs.org'
-    for link in export.find_all('a'):
-        if link.get('title') == 'Citation and references':
-            url += link.get('href')
-    print(url)
-    return url
-
-def download(url): # Download citation and references file
-    if url.find('='):
-        filename = url.rsplit('=', 1)[1]
-    path = Path(('./files/' + filename))
-    if path.is_file():
-        print("File already exists")
-    else:
-        print("File does not exist")
-
-def get_citation_info(pub, num_citations, soup):
-    pub._citations = []
-    details = soup.find('ol', class_ = 'cited-content_cbyCitation')
-    titles = []
-    for title in details.find_all('span',
-                                  class_ = 'cited-content_cbyCitation_article-title'):
-        titles.append(title.text.replace('.', ''))
-    journal_names = []
-    for name in details.find_all('span',
-                                 class_ = 'cited-content_cbyCitation_journal-name'):
-        journal_names.append(name.text)
-    doi_urls = []
-    for url in details.find_all('a'):
-        doi_urls.append(url.get('href'))
-    contributors = []
-    for contrib in details.find_all('span',
-                                    class_ = 'cited-content_cbyCitation_article-contributors'):
-        contributors.append(contrib.text)
-    for i in range(0, int(num_citations)):
-        pub._citations.append(Citation(titles[i], journal_names[i],
-                                       contributors[i], doi_urls[i]))
-def print_pub_info(pub):
-    print(f'''Article title: {pub.title}
-Publication date: {pub.publication_date}
-DOI-URL: {pub.doi_url}
-
-Subjects:''')
-    print(*(pub.subjects), sep = ", ")
-    print('\nContributors:')
-    print(*(pub.contributors), sep = ", ")
-
-    if int(pub.num_citations) > 0:
-        if int(pub.num_citations) == 1:
-            print(f'\nThis publication is cited by the following publication:\n')
-        else:
-            print(f'\nThis publication is cited by the following {pub.num_citations} publications:\n')
-        for citation in pub._citations:
-            print(f'''
-    Title: {citation.title}
-    Journal: {citation.journal}
-    Contributors: {citation.contributors}
-    DOI-URL: {citation.doi_url}
-    ''')
-    else:
-        print('\nThis publication is not cited by any other publication.')
-
-def input(url):
-    html_text = req.get(url).text
-    soup = bs(html_text, 'html.parser')
-
-    pub = get_article_info(soup)
-    if int(pub.num_citations) > 0:
-        get_citation_info(pub, int(pub.num_citations), soup)
-    return pub
-
-#if len(sys.argv) != 2:
-#    sys.stderr.write('Usage: {} <url>\n'.format(sys.argv[0]))
-#    exit(1)
-#url = sys.argv[1]
-#pub = input(url)
-#print_pub_info(pub)
diff --git a/verarbeitung/Processing.py b/verarbeitung/Processing.py
new file mode 100644
index 0000000000000000000000000000000000000000..bfa533a3161063b7ec82231cbc2c34950336eb11
--- /dev/null
+++ b/verarbeitung/Processing.py
@@ -0,0 +1,117 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Nov 3 16:54:43 2021
+
+@author: Malte Schokolowski
+"""
+
+from bs4 import BeautifulSoup as bs
+import requests as req
+import sys
+from pathlib import Path
+from input_fj import input
+from json_demo import output_to_json
+
+
+
+def process_main(doi_input_array, depth):
+    # ERROR handling: the doi input array is empty
+    if (len(doi_input_array) == 0):
+        print("Error, no input data")
+
+    # ERROR: a negative number was entered for the search depth
+    if (depth < 0):
+        print("Error, depth of search must be positive")
+
+
+    # An empty array for the nodes is created.
+    # An empty array for the edges is created.
+    global nodes, edges
+    nodes = []
+    edges = []
+
+    # Every publication from the input array is inserted into the node array (nodes).
+    for pub_doi in doi_input_array:
+        pub = input(pub_doi)
+        not_in_nodes = True
+        for node in nodes:
+            if (pub.doi_url == node.doi_url):
+                not_in_nodes = False
+                break
+        if (not_in_nodes):
+            nodes.append(pub)
+        else:
+            doi_input_array.remove(pub_doi)
+
+    process_rec_depth(doi_input_array, 0, depth)
+
+    output_to_json(nodes,edges)
+    return(nodes,edges)
+
+
+def process_rec_depth(array, depth, depth_max):
+    # The depth is increased by 1 with every recursive call.
+    depth += 1
+
+    # A class object is created for every publication in the input array.
+    for pub_doi in array:
+        pub = input(pub_doi)
+
+        # For every citation that is stored in the corresponding class instance of the publication,
+        # it is checked whether it already exists as a node.
+        for citation in pub._citations:
+
+            # If the citation does not yet exist in the node array (nodes) AND the maximum depth
+            # has not been reached yet, it is stored as a node in the node array. In addition,
+            # the connection to the publication is stored as a tuple in the edge array (edges).
+            not_in_nodes = True
+            for node in nodes:
+                if (citation.doi_url == node.doi_url):
+                    not_in_nodes = False
+                    break
+            if (not_in_nodes):
+                if (depth <= depth_max):
+                    nodes.append(citation)
+                    edges.append([pub.doi_url,citation.doi_url])
+
+            # If the citation already exists in the node array, only the connection to the publication
+            # is stored as a tuple in the edge array (edges).
+            else:
+                edges.append([pub.doi_url,citation.doi_url])
+
+        # If the maximum depth has not been reached yet, all citations of the publication are
+        # written into an array and the function is called again with this array.
+        if (depth < depth_max):
+            cit_arr = []
+            for citation in pub._citations:
+
+                # At the moment only citations with acs in the URL are stored, since we cannot
+                # extract the information from other sources.
+                if ("acs" in citation.doi_url):
+                    cit_arr.append(citation.doi_url)
+
+            # Recursive call of the function.
+            process_rec_depth(cit_arr, depth, depth_max)
+
+
+
+# Program test, since there is no connection to the input part yet.
+arr = []
+arr.append('https://pubs.acs.org/doi/10.1021/acs.jcim.9b00249')
+arr.append('https://pubs.acs.org/doi/10.1021/acs.jcim.9b00249')
+arr.append('https://doi.org/10.1021/acs.jmedchem.0c01332')
+#arr.append('https://doi.org/10.1021/acs.jcim.0c00741')
+
+#arr.append('https://doi.org/10.1021/ci700007b')
+#arr.append('https://doi.org/10.1021/acs.jcim.5b00292')
+#url = sys.argv[1]
+#arr.append[url]
+
+nodes,edges = process_main(arr,1)
+
+print("Nodes:\n")
+for node in nodes:
+    print(node.title, "\n")
+print("\nEdges:\n")
+for edge in edges:
+    print(edge,"\n")
\ No newline at end of file
diff --git a/verarbeitung/__pycache__/input_fj.cpython-38.pyc b/verarbeitung/__pycache__/input_fj.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fb7f56fc9742c5d19e2fff3d15c51dc4d59e1b1a
Binary files /dev/null and b/verarbeitung/__pycache__/input_fj.cpython-38.pyc differ
diff --git a/verarbeitung/__pycache__/json_demo.cpython-38.pyc b/verarbeitung/__pycache__/json_demo.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..519d14366f17ebf12bfdc3f6ae2e985ad9d6d229
Binary files /dev/null and b/verarbeitung/__pycache__/json_demo.cpython-38.pyc differ
diff --git a/verarbeitung/json_demo.py b/verarbeitung/json_demo.py
new file mode 100644
index 0000000000000000000000000000000000000000..77ce148c8b3898c97472f2c43f1750b99a0ae9a1
--- /dev/null
+++ b/verarbeitung/json_demo.py
@@ -0,0 +1,33 @@
+#!/usr/bin/env python3
+import json
+from input_fj import input
+
+def output_to_json(V,E):
+    list_of_node_dicts = list()
+    list_of_edge_dicts = list()
+    dict_of_all = dict()
+    for node in V:
+        new_dict = dict()
+        new_dict["name"] = node.title
+        new_dict["author"] = node.contributors
+        new_dict["year"] = node.publication_date
+        new_dict["doi"] = node.doi_url
+
+
+        list_of_node_dicts.append(new_dict)
+    for edge in E:
+        new_dict_2 = dict()
+        new_dict_2["source"] = edge[0]
+        new_dict_2["target"] = edge[1]
+        list_of_edge_dicts.append(new_dict_2)
+    dict_of_all["nodes"] = list_of_node_dicts
+    dict_of_all["links"] = list_of_edge_dicts
+    #return(dict_of_all)
+    with open('json_text.txt','w') as outfile:
+        json.dump(dict_of_all, outfile)
+
+
+#knoten = ["doi1", "doi2", "doi3"]
+#kanten = [[1,2],[3,4],[5,6]]
+#output_to_json(knoten,kanten)
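Note on exercising the traversal without network access: the sketch below replaces the input() function imported into Processing.py with a stub, so process_main() can be run offline. FakeCitation, FakePub, fake_input and FAKE_GRAPH are hypothetical test doubles, not part of this change, and the sketch assumes the trial block at the bottom of Processing.py has been commented out or moved under an if __name__ == '__main__': guard (otherwise importing the module fires real requests), and that it is run from the verarbeitung/ directory.

    import Processing  # assumes the module-level trial calls in Processing.py are disabled

    class FakeCitation:
        # Mimics the Citation class from input_fj; publication_date is added because
        # output_to_json() in json_demo.py reads it from every node.
        def __init__(self, doi_url):
            self.title = doi_url
            self.journal = ""
            self.contributors = ""
            self.publication_date = ""
            self.doi_url = doi_url

    class FakePub:
        # Mimics the Publication class from input_fj, with canned citations.
        def __init__(self, doi_url, cited_dois):
            self.title = doi_url
            self.publication_date = "2021"
            self.contributors = []
            self.doi_url = doi_url
            self.num_citations = len(cited_dois)
            self._citations = [FakeCitation(d) for d in cited_dois]

    # Hypothetical three-publication citation graph; "acs" is kept in the DOIs so the
    # recursion in process_rec_depth() follows them.
    FAKE_GRAPH = {
        "doi/acs-A": ["doi/acs-B", "doi/acs-C"],
        "doi/acs-B": ["doi/acs-C"],
        "doi/acs-C": [],
    }

    def fake_input(doi_url):
        return FakePub(doi_url, FAKE_GRAPH.get(doi_url, []))

    Processing.input = fake_input  # override the name imported from input_fj
    nodes, edges = Processing.process_main(["doi/acs-A"], 2)
    print(len(nodes), len(edges))  # expected: 3 nodes and 3 edges for this graph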
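Once json_text.txt has been written, the nodes/links structure produced by output_to_json() can be inspected as follows (a minimal sketch; the keys match the ones set in json_demo.py).

    import json

    # Load the graph that output_to_json() dumped to json_text.txt.
    with open('json_text.txt') as infile:
        graph = json.load(infile)

    # "nodes" holds one dict per publication ("name", "author", "year", "doi"),
    # "links" holds one dict per edge ("source", "target") keyed by DOI-URL.
    for node in graph["nodes"]:
        print(node["doi"], "-", node["name"])
    for link in graph["links"]:
        print(link["source"], "->", link["target"])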