#!/usr/bin/env python3
"""Functions for information retrieval of articles from the ACS journal JCIM."""

__author__ = "Florian Jochens"
__email__ = "fj@andaco.de"
__status__ = "Production"
#__copyright__ = ""
#__credits__ = ["", "", "", ""]
#__license__ = ""
#__version__ = ""
#__maintainer__ = ""

import sys
from pathlib import Path

import requests as req
from bs4 import BeautifulSoup as bs


class Publication:
    """Metadata of a single journal article."""

    # NOTE(review): class-level list is shared by ALL instances — confirm
    # that is intended before appending per-article citations here.
    _citations = []

    def __init__(self, title, publication_date, contributors, doi_url):
        self.title = title                      # article title (str)
        self.publication_date = publication_date  # as shown on the page (str)
        self.contributors = contributors        # list of author names (str)
        self.doi_url = doi_url                  # DOI link (str)


class Citations:
    """Metadata of a single citing/cited work (currently unused here)."""

    def __init__(self, title, journal, contributors, doi_url):
        self.title = title
        self.journal = journal
        self.contributors = contributors
        self.doi_url = doi_url


def get_article_info(page=None):
    """Extract title, publication date, contributors and DOI URL from an
    ACS article page and return them as a Publication.

    page: BeautifulSoup of the article HTML. Defaults to the module-level
          global ``soup`` for backward compatibility with the old no-arg call.
    """
    if page is None:
        page = soup  # legacy behavior: read the global set by the script body
    header = page.find('div', class_='article_header-left pull-left')
    article_title = header.find('span', class_='hlFld-Title').text
    publication_date = header.find('span', class_='pub-date-value').text
    doi_url = None
    # The DOI div contains the anchor(s); the last href wins, as before.
    for link in header.find('div', class_='article_header-doiurl'):
        doi_url = link.get('href')
    #subjects = []
    #for sub in header.find('div', class_='article_header-taxonomy').find_all('a'):
    #    subjects.append(sub.get('title'))
    contributors = [
        con.text
        for con in header.find('ul', class_='loa')
                         .find_all('span', class_='hlFld-ContribAuthor')
    ]
    return Publication(article_title, publication_date, contributors, doi_url)


def get_download_url(page=None):
    """Return the absolute URL of the 'Citation and references' export link.

    page: BeautifulSoup of the article HTML. Defaults to the module-level
          global ``soup`` for backward compatibility with the old no-arg call.
    """
    if page is None:
        page = soup  # legacy behavior: read the global set by the script body
    export = page.find('div', class_='cit-download-dropdown_content')
    url = 'https://pubs.acs.org'
    for link in export.find_all('a'):
        if link.get('title') == 'Citation and references':
            url += link.get('href')
    return url


def download(url):
    """Report whether the citation/references file named in *url* exists locally.

    NOTE(review): despite its name this function performs no HTTP download
    yet — it only checks for ./files/<name>. Confirm whether fetching the
    file was intended here.
    """
    # Bug fix: str.find() returns an index, so the original truthiness test
    # was wrong — a missing '=' gave -1 (truthy, then IndexError on rsplit),
    # and '=' at position 0 gave 0 (falsy, branch skipped). Use membership.
    if '=' in url:
        filename = url.rsplit('=', 1)[1]
        path = Path('./files') / filename
        if path.is_file():
            print("File already exists")
        else:
            print("File does not exist")


#def get_citation_info(pub):
#    return pub


def _main():
    """Script entry point: scrape one article URL given on the command line."""
    global soup  # kept as a module global for the legacy no-arg helpers
    if len(sys.argv) != 2:
        sys.stderr.write('Usage: {} <url>\n'.format(sys.argv[0]))
        sys.exit(1)
    url = sys.argv[1]
    html_text = req.get(url).text
    soup = bs(html_text, 'html.parser')
    pub = get_article_info(soup)
    print(f'''Article title: {pub.title}
Publication date: {pub.publication_date}
Contributors:''')
    print(*(pub.contributors), sep=", ")
    print(f'''
DOI-URL: {pub.doi_url}''')
    download(get_download_url(soup))


# Guarding the script body fixes the old behavior where merely importing
# this module parsed sys.argv and could call exit(1).
if __name__ == '__main__':
    _main()