#!/usr/bin/env python3
"""
Functions for information retrieval of articles from the ACS journal JCIM

Part of the input scripts ("Input-Skripts") of the CiS-Projekt 2021/22.

The parsing helpers work on an already parsed page object (anything that
offers BeautifulSoup-style ``find`` / ``find_all``); only ``input`` touches
the network.
"""

__author__ = "Florian Jochens"
__email__ = "fj@andaco.de"
__status__ = "Production"
#__copyright__ = ""
#__credits__ = ["", "", "", ""]
#__license__ = ""
#__version__ = ""
#__maintainer__ = ""

import sys
from itertools import islice
from pathlib import Path

try:
    from bs4 import BeautifulSoup as bs
    import requests as req
except ImportError as _exc:
    # Only input() needs the scraping stack.  Deferring the failure keeps the
    # data classes and the printing helper usable without these packages.
    bs = None
    req = None
    _SCRAPING_IMPORT_ERROR = _exc
else:
    _SCRAPING_IMPORT_ERROR = None


class Publication:
    """Metadata of one article plus the publications that cite it.

    Attributes:
        title: Article title.
        publication_date: Publication date as scraped from the page.
        contributors: List of author names.
        doi_url: DOI link of the article.
        subjects: List of subject keywords.
        num_citations: Number of citing publications.
        _citations: List of ``Citation`` objects, filled by
            ``get_citation_info``.
    """
    #_registry = []

    def __init__(self, title, publication_date, contributors, doi_url,
                 subjects, num_citations):
        #self._registry.append(self)
        self.title = title
        self.publication_date = publication_date
        self.contributors = contributors
        self.doi_url = doi_url
        self.subjects = subjects
        self.num_citations = num_citations
        # Must be created per instance: a class-level list would be shared,
        # so citations of one publication would show up on every other one.
        self._citations = []


class Citation:
    """One publication that cites a ``Publication``."""

    def __init__(self, title, journal, contributors, doi_url):
        self.title = title
        self.journal = journal
        self.contributors = contributors
        self.doi_url = doi_url


def get_article_info(soup):
    """Build a ``Publication`` from the header of a parsed article page.

    Args:
        soup: Parsed article page.

    Returns:
        A ``Publication`` with an empty citation list.  ``num_citations`` is
        always an ``int`` (0 when the page shows no citation link).

    Raises:
        ValueError: If the page has no article header.
    """
    header = soup.find('div', class_='article_header-left pull-left')
    if header is None:
        raise ValueError('article header not found in the given page')

    article_title = header.find('span', class_='hlFld-Title').text
    publication_date = header.find('span', class_='pub-date-value').text

    # Stays None when the page carries no DOI link instead of being unbound.
    doi_url = None
    doi_box = header.find('div', class_='article_header-doiurl')
    if doi_box is not None:
        for link in doi_box.find_all('a'):
            doi_url = link.get('href')

    taxonomy = header.find('div', class_='article_header-taxonomy')
    subjects = []
    if taxonomy is not None:
        subjects = [sub.get('title') for sub in taxonomy.find_all('a')]

    authors = header.find('ul', class_='loa')
    contributors = []
    if authors is not None:
        contributors = [
            con.text
            for con in authors.find_all('span', class_='hlFld-ContribAuthor')
        ]

    metrics = header.find('div', class_='articleMetrics_count')
    if metrics is None or metrics.a is None:
        num_citations = 0
    else:
        # Normalised to int so both branches yield the same type.
        num_citations = int(metrics.a.text)

    return Publication(article_title, publication_date, contributors,
                       doi_url, subjects, num_citations)


def get_download_url(soup=None):
    """Return the URL of the "Citation and references" export of a page.

    Args:
        soup: Parsed article page.  When omitted, a module-level ``soup`` is
            used if one has been defined (the behaviour the original
            parameterless version relied on).

    Returns:
        The absolute download URL, or just ``'https://pubs.acs.org'`` when
        the export menu contains no matching link.  The URL is also printed.

    Raises:
        NameError: If no page was passed and no module-level ``soup`` exists.
        ValueError: If the page has no export menu.
    """
    if soup is None:
        soup = globals().get('soup')
        if soup is None:
            raise NameError("name 'soup' is not defined; pass the parsed page")

    export = soup.find('div', class_='cit-download-dropdown_content')
    if export is None:
        raise ValueError('citation export menu not found in the given page')

    url = 'https://pubs.acs.org'
    for link in export.find_all('a'):
        if link.get('title') == 'Citation and references':
            url += link.get('href')
            break  # a second match would be glued onto the first one
    print(url)
    return url


def download(url):  # Download citation and references file
    """Report whether the export file named in ``url`` exists in ``./files``.

    The file name is the text after the last ``=`` of ``url``.  Nothing is
    downloaded yet; the function only prints the result of the check.

    Raises:
        ValueError: If ``url`` contains no ``=``.
    """
    # str.find() returns -1 (truthy) when nothing is found, so it cannot be
    # used as a membership test; rpartition reports the separator explicitly.
    _, separator, filename = url.rpartition('=')
    if not separator:
        raise ValueError(f"no '=' in URL, cannot derive a file name: {url!r}")

    path = Path('./files') / filename
    if path.is_file():
        print("File already exists")
    else:
        print("File does not exist")


def get_citation_info(pub, num_citations, soup):
    """Append the citing publications listed on a page to ``pub``.

    Args:
        pub: ``Publication`` whose ``_citations`` list is extended.
        num_citations: Upper bound on the number of citations to add.
        soup: Parsed article page.

    At most ``num_citations`` entries are added; fewer when the page lists
    fewer complete entries.  Nothing is added when the page has no citation
    list.
    """
    details = soup.find('ol', class_='cited-content_cbyCitation')
    if details is None:
        return

    # NOTE(review): replace() drops every period, not only a trailing one.
    titles = [
        title.text.replace('.', '')
        for title in details.find_all(
            'span', class_='cited-content_cbyCitation_article-title')
    ]
    journal_names = [
        name.text
        for name in details.find_all(
            'span', class_='cited-content_cbyCitation_journal-name')
    ]
    # NOTE(review): assumes every entry contributes exactly one anchor, so
    # that the anchors line up with the other lists -- confirm on a live page.
    doi_urls = [url.get('href') for url in details.find_all('a')]
    contributors = [
        contrib.text
        for contrib in details.find_all(
            'span', class_='cited-content_cbyCitation_article-contributors')
    ]

    # zip() stops at the shortest list, so a page that shows fewer entries
    # than the advertised count no longer ends in an IndexError.
    limit = max(int(num_citations), 0)
    entries = zip(titles, journal_names, contributors, doi_urls)
    for title, journal, contribs, doi_url in islice(entries, limit):
        pub._citations.append(Citation(title, journal, contribs, doi_url))


def print_pub_info(pub):
    """Print a human-readable summary of ``pub`` and its citations."""
    print(f'''Article title: {pub.title}
Publication date: {pub.publication_date}
DOI-URL: {pub.doi_url}

Subjects:''')
    print(*(pub.subjects), sep=", ")
    print('\nContributors:')
    print(*(pub.contributors), sep=", ")

    count = int(pub.num_citations)
    if count <= 0:
        print('\nThis publication is not cited by any other publication.')
        return

    if count == 1:
        print('\nThis publication is cited by the following publication:\n')
    else:
        print(f'\nThis publication is cited by the following {pub.num_citations} publications:\n')
    for citation in pub._citations:
        print(f'''
    Title: {citation.title}
    Journal: {citation.journal}
    Contributors: {citation.contributors}
    DOI-URL: {citation.doi_url}
    ''')


def input(url, timeout=30):
    """Fetch an article page and return it as a ``Publication``.

    The name shadows the builtin ``input`` inside this module; it is kept
    because it is the module's public entry point.

    Args:
        url: Address of the article page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``Publication`` including its citations, if the page lists any.

    Raises:
        ImportError: If ``bs4`` or ``requests`` is not installed.
        requests.RequestException: On timeouts, connection problems or an
            HTTP error status.
    """
    if _SCRAPING_IMPORT_ERROR is not None:
        raise ImportError(
            "input() needs the 'beautifulsoup4' and 'requests' packages"
        ) from _SCRAPING_IMPORT_ERROR

    # Without a timeout requests waits indefinitely on a stalled server.
    response = req.get(url, timeout=timeout)
    response.raise_for_status()
    soup = bs(response.text, 'html.parser')

    pub = get_article_info(soup)
    if int(pub.num_citations) > 0:
        get_citation_info(pub, int(pub.num_citations), soup)
    return pub

#if len(sys.argv) != 2:
#    sys.stderr.write('Usage: {} <url>\n'.format(sys.argv[0]))
#    exit(1)
#url = sys.argv[1]
#pub = input(url)
#print_pub_info(pub)