"""
Functions for information retrieval of articles from the ACS journal JCIM
"""
__author__ = "Florian Jochens"
__email__ = "fj@andaco.de"
__status__ = "Production"
#__copyright__ = ""
#__credits__ = ["", "", "", ""]
#__license__ = ""
#__version__ = ""
#__maintainer__ = ""
from bs4 import BeautifulSoup as bs
import requests as req
import sys
from pathlib import Path
class Publication:
    """Metadata record for a published article.

    NOTE: ``_citations`` is a class-level list and therefore shared by
    every instance of this class (it is not per-publication state).
    """

    _citations = []

    def __init__(self, title, publication_date, contributors, doi_url):
        """Store the article's title, publication date, author names and DOI-URL."""
        self.title, self.publication_date = title, publication_date
        self.contributors, self.doi_url = contributors, doi_url
class Citations:
    """Metadata record for one citation of an article."""

    def __init__(self, title, journal, contributors, doi_url):
        """Store the citing work's title, journal, author names and DOI-URL."""
        self.title, self.journal = title, journal
        self.contributors, self.doi_url = contributors, doi_url
def get_article_info(page=None):
    """Collect title, publication date, contributors and DOI-URL from an
    ACS article page.

    :param page: parsed article page (BeautifulSoup).  When omitted, the
        module-level ``soup`` created in the script body is used, so the
        original no-argument call keeps working.
    :return: a ``Publication`` instance holding the extracted metadata.
    """
    if page is None:
        page = soup  # module-level global created in the script body
    header = page.find('div', class_ = 'article_header-left pull-left')
    article_title = header.find('span', class_ = 'hlFld-Title').text
    publication_date = header.find('span', class_ = 'pub-date-value').text
    # Read the href from the anchor tags inside the DOI div.  Iterating the
    # div's children directly (as before) crashes on bare text nodes, since
    # NavigableString has no .get(); only <a> tags carry the DOI link.
    doi_url = None
    for link in header.find('div', class_ = 'article_header-doiurl').find_all('a'):
        doi_url = link.get('href')
    cons = header.find('ul', class_ = 'loa')
    contributors = [con.text
                    for con in cons.find_all('span', class_ = 'hlFld-ContribAuthor')]
    return Publication(article_title, publication_date, contributors, doi_url)
def get_download_url(page=None):
    """Build the absolute URL of the 'Citation and references' export file
    offered on an ACS article page.

    :param page: parsed article page (BeautifulSoup).  When omitted, the
        module-level ``soup`` created in the script body is used.
    :return: absolute download URL, or the bare host
        'https://pubs.acs.org' when no matching export link exists.
    """
    if page is None:
        page = soup  # module-level global created in the script body
    export = page.find('div', class_ = 'cit-download-dropdown_content')
    url = 'https://pubs.acs.org'
    for link in export.find_all('a'):
        if link.get('title') == 'Citation and references':
            url += link.get('href')
            # Stop at the first match: the old loop concatenated the href
            # of *every* matching link, yielding an invalid URL whenever
            # the page repeats the export link.
            break
    return url
def download(url):
    """Check whether the citation/references file for *url* is already
    present in ./files/.

    The filename is taken from the text after the last '=' in the URL
    (the query parameter of the export link produced by
    ``get_download_url``).

    NOTE(review): despite its name this function does not download
    anything yet; it only reports (via print) whether the file exists.
    """
    # BUG FIX: the original used `if url.find('='):`.  str.find returns
    # -1 (truthy!) when '=' is absent and 0 (falsy) when '=' is the first
    # character, so the guard was inverted for both edge cases.  Test
    # membership explicitly instead.
    if '=' in url:
        filename = url.rsplit('=', 1)[1]
        path = Path('./files/' + filename)
        if path.is_file():
            print("File already exists")
        else:
            print("File does not exist")
#def get_citation_info(pub)  # planned follow-up, not yet implemented

# --- Script body ---------------------------------------------------------
# Expects exactly one command line argument: the URL of an ACS article.
if len(sys.argv) != 2:
    sys.stderr.write('Usage: {} <url>\n'.format(sys.argv[0]))
    exit(1)
url = sys.argv[1]
# Fetch and parse the article page.  `soup` is a module-level global that
# get_article_info() and get_download_url() read.
html_text = req.get(url).text
soup = bs(html_text, 'html.parser')
# Print the article's metadata to stdout.
pub = get_article_info()
print(f'''Article title: {pub.title}
Publication date: {pub.publication_date}
Contributors:''')
print(*(pub.contributors), sep = ", ")
print(f'''
DOI-URL: {pub.doi_url}''')
# Locate the citation/references export file and check the local cache.
url = get_download_url()
download(url)