#!/usr/bin/env python3
"""Functions for information retrieval of articles from the ACS journal JCIM."""

__author__ = "Florian Jochens"
__email__ = "fj@andaco.de"
__status__ = "Production"

import sys
from pathlib import Path

from bs4 import BeautifulSoup as bs
import requests as req


class Publication:
    """Metadata of one journal article plus the citations scraped for it."""

    def __init__(self, title, publication_date, contributors, doi_url,
                 subjects, num_citations):
        self.title = title
        self.publication_date = publication_date
        self.contributors = contributors
        self.doi_url = doi_url
        self.subjects = subjects
        self.num_citations = num_citations
        # FIX: was a shared mutable class attribute (`_citations = []` on the
        # class); every instance now owns its own list. It is (re)populated by
        # get_citation_info().
        self._citations = []


class Citation:
    """One publication that cites a Publication."""

    def __init__(self, title, journal, contributors, doi_url):
        self.title = title
        self.journal = journal
        self.contributors = contributors
        self.doi_url = doi_url


def get_article_info(soup):
    """Extract article metadata from a parsed ACS article page.

    Reads title, publication date, DOI URL, subjects, contributors and the
    citation count out of the page header and returns them as a Publication.
    """
    header = soup.find('div', class_='article_header-left pull-left')
    article_title = header.find('span', class_='hlFld-Title').text
    publication_date = header.find('span', class_='pub-date-value').text
    # The DOI container holds a single link; the loop keeps the last href.
    for link in header.find('div', class_='article_header-doiurl'):
        doi_url = link.get('href')
    subs = header.find('div', class_='article_header-taxonomy')
    subjects = [sub.get('title') for sub in subs.find_all('a')]
    cons = header.find('ul', class_='loa')
    contributors = [con.text
                    for con in cons.find_all('span', class_='hlFld-ContribAuthor')]
    numc = header.find('div', class_='articleMetrics_count')
    # No <a> inside the metrics block means the article has no citations yet.
    num_citations = numc.a.text if numc.a else 0
    return Publication(article_title, publication_date, contributors,
                       doi_url, subjects, num_citations)


def get_download_url(soup):
    """Return the ACS URL of the article's citation-and-references export.

    FIX: the original body referenced ``soup`` without it being a parameter
    or a module global, so every call raised NameError. It is now an explicit
    parameter (the parsed article page).
    """
    export = soup.find('div', class_='cit-download-dropdown_content')
    url = 'https://pubs.acs.org'
    for link in export.find_all('a'):
        if link.get('title') == 'Citation and references':
            url += link.get('href')
    print(url)
    return url


def download(url):
    """Report whether the citation file named in *url* already exists locally.

    FIX: the original tested ``if url.find('=')`` — str.find returns -1
    (truthy) when '=' is absent and 0 (falsy) when it is at index 0, so the
    check was wrong in both edge cases. Use membership instead.
    """
    if '=' in url:
        filename = url.rsplit('=', 1)[1]
        path = Path('./files/' + filename)
        if path.is_file():
            print("File already exists")
        else:
            print("File does not exist")


def get_citation_info(pub, num_citations, soup):
    """Fill pub._citations with the first *num_citations* citing publications.

    Parses the "cited by" list of the article page and builds one Citation
    per entry. Raises IndexError if the page lists fewer entries than
    *num_citations*.
    """
    pub._citations = []
    details = soup.find('ol', class_='cited-content_cbyCitation')
    titles = [t.text.replace('.', '')
              for t in details.find_all(
                  'span', class_='cited-content_cbyCitation_article-title')]
    journal_names = [n.text
                     for n in details.find_all(
                         'span', class_='cited-content_cbyCitation_journal-name')]
    # NOTE(review): this collects every <a> under the list, assuming exactly
    # one link per citation entry — verify against the actual page markup.
    doi_urls = [a.get('href') for a in details.find_all('a')]
    contributors = [c.text
                    for c in details.find_all(
                        'span',
                        class_='cited-content_cbyCitation_article-contributors')]
    for i in range(int(num_citations)):
        pub._citations.append(
            Citation(titles[i], journal_names[i], contributors[i], doi_urls[i]))


def print_pub_info(pub):
    """Pretty-print a Publication and its citations to stdout."""
    print(f'''Article title: {pub.title}
Publication date: {pub.publication_date}
DOI-URL: {pub.doi_url}

Subjects:''')
    print(*pub.subjects, sep=", ")
    print('\nContributors:')
    print(*pub.contributors, sep=", ")
    if int(pub.num_citations) > 0:
        if int(pub.num_citations) == 1:
            print('\nThis publication is cited by the following publication:\n')
        else:
            print('\nThis publication is cited by the following '
                  f'{pub.num_citations} publications:\n')
        for citation in pub._citations:
            print(f'''Title: {citation.title}
Journal: {citation.journal}
Contributors: {citation.contributors}
DOI-URL: {citation.doi_url}
''')
    else:
        print('\nThis publication is not cited by any other publication.')


def input(url):
    """Fetch *url*, parse it, and return a fully populated Publication.

    NOTE(review): this name shadows the builtin ``input``; it is kept
    unchanged for backward compatibility with existing callers.
    """
    html_text = req.get(url).text
    soup = bs(html_text, 'html.parser')
    pub = get_article_info(soup)
    if int(pub.num_citations) > 0:
        get_citation_info(pub, int(pub.num_citations), soup)
    return pub


# Example command-line usage (intentionally left disabled):
#if len(sys.argv) != 2:
#    sys.stderr.write('Usage: {} <url>\n'.format(sys.argv[0]))
#    exit(1)
#url = sys.argv[1]
#pub = input(url)
#print_pub_info(pub)