Newer
Older
Functions for information retrievel of articles from the ACS journal JCIM
"""
__author__ = "Florian Jochens"
__email__ = "fj@andaco.de"
__status__ = "Production"
#__copyright__ = ""
#__credits__ = ["", "", "", ""]
#__license__ = ""
#__version__ = ""
#__maintainer__ = ""
from bs4 import BeautifulSoup as bs
import requests as req
from pathlib import Path
class Publication:
#_registry = []
_citations = []
def __init__(self, title, publication_date, contributors, doi_url,
subjects, num_citations):
#self._registry.append(self)
self.title = title
self.publication_date = publication_date
self.contributors = contributors
self.doi_url = doi_url
self.subjects = subjects
self.num_citations = num_citations
def __init__(self, title, journal, contributors, doi_url):
self.title = title
self.journal = journal
self.contributors = contributors
self.doi_url = doi_url
header = soup.find('div', class_ = 'article_header-left pull-left')
article_title = header.find('span', class_ = 'hlFld-Title').text
publication_date = header.find('span', class_ = 'pub-date-value').text
for link in header.find('div', class_ = 'article_header-doiurl'):
doi_url = link.get('href')
subs = header.find('div', class_ = 'article_header-taxonomy')
subjects = []
for sub in subs.find_all('a'):
subjects.append(sub.get('title'))
cons = header.find('ul', class_ = 'loa')
contributors = []
for con in cons.find_all('span', class_ = 'hlFld-ContribAuthor'):
contributors.append(con.text)
num_citations = header.find('a', class_ = 'internalNav').text
pub = Publication(article_title, publication_date, contributors, doi_url,
subjects, num_citations)
def get_download_url():
export = soup.find('div', class_ = 'cit-download-dropdown_content')
url = 'https://pubs.acs.org'
for link in export.find_all('a'):
if link.get('title') == 'Citation and references':
url += link.get('href')
return url
def download(url): # Download citation and references file
if url.find('='):
filename = url.rsplit('=', 1)[1]
path = Path(('./files/' + filename))
if path.is_file():
print("File already exists")
else:
print("File does not exist")
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
def get_citation_info(pub, num_citations, soup):
details = soup.find('ol', class_ = 'cited-content_cbyCitation')
titles = []
for title in details.find_all('span',
class_ = 'cited-content_cbyCitation_article-title'):
titles.append(title.text.replace('.', ''))
journal_names = []
for name in details.find_all('span',
class_ = 'cited-content_cbyCitation_journal-name'):
journal_names.append(name.text)
doi_urls = []
for url in details.find_all('a'):
doi_urls.append(url.get('href'))
contributors = []
for contrib in details.find_all('span',
class_ = 'cited-content_cbyCitation_article-contributors'):
contributors.append(contrib.text)
for i in range(0, int(num_citations)):
pub._citations.append(Citation(titles[i], journal_names[i],
contributors[i], doi_urls[i]))
def print_pub_info(pub):
print(f'''Article title: {pub.title}
Publication date: {pub.publication_date}
DOI-URL: {pub.doi_url}
Subjects:''')
print(*(pub.subjects), sep = ", ")
print('\nContributors:')
print(*(pub.contributors), sep = ", ")
print(f'\nThis article is cited by the following {pub.num_citations} publications\n')
for citation in pub._citations:
print(f'''
Title: {citation.title}
Journal: {citation.journal}
Contributors: {citation.contributors}
DOI-URL: {citation.doi_url}
''')
def input(url):
html_text = req.get(url).text
soup = bs(html_text, 'html.parser')
pub = get_article_info(soup)
get_citation_info(pub, int(pub.num_citations), soup)
return pub
if len(sys.argv) != 2:
sys.stderr.write('Usage: {} <url>\n'.format(sys.argv[0]))
exit(1)
url = sys.argv[1]
pub = input(url)
print_pub_info(pub)