__status__ = "Production"

from bs4 import BeautifulSoup as bs
import requests as req
import sys
from pathlib import Path


class Publication:
    """Metadata scraped from one article page, plus the publications citing it.

    Attributes mirror the constructor arguments; ``_citations`` collects
    ``Citation`` objects discovered by ``get_citation_info``.
    """

    def __init__(self, title, publication_date, contributors, doi_url,
                 subjects, num_citations):
        self.title = title
        self.publication_date = publication_date
        self.contributors = contributors
        self.doi_url = doi_url
        self.subjects = subjects
        # Citation count exactly as scraped from the page (a string).
        self.num_citations = num_citations
        # BUGFIX: this used to be a class-level list shared by every
        # Publication instance, so citations appended for one article
        # leaked into all others; it is now per-instance.
        self._citations = []


class Citation:
    """One entry from the 'cited by' list of a Publication."""

    def __init__(self, title, journal, contributors, doi_url):
        self.title = title
        self.journal = journal
        self.contributors = contributors
        self.doi_url = doi_url


def get_article_info(soup):
    """Extract title, date, DOI, subjects, authors and citation count from
    the parsed article page *soup* and return a populated Publication.

    NOTE(review): the CSS class names are specific to the publisher's page
    layout — verify against a live page if scraping breaks.
    """
    header = soup.find('div', class_ = 'article_header-left pull-left')
    article_title = header.find('span', class_ = 'hlFld-Title').text
    publication_date = header.find('span', class_ = 'pub-date-value').text
    # The DOI wrapper holds the link element(s); the loop keeps the last href.
    for link in header.find('div', class_ = 'article_header-doiurl'):
        doi_url = link.get('href')
    subs = header.find('div', class_ = 'article_header-taxonomy')
    subjects = [sub.get('title') for sub in subs.find_all('a')]
    cons = header.find('ul', class_ = 'loa')
    contributors = [con.text
                    for con in cons.find_all('span', class_ = 'hlFld-ContribAuthor')]
    num_citations = header.find('a', class_ = 'internalNav').text
    return Publication(article_title, publication_date, contributors, doi_url,
                       subjects, num_citations)
def get_citation_info(pub, num_citations, soup):
    """Parse the 'cited by' list from *soup* and append one Citation per
    entry (up to *num_citations*) to ``pub._citations``.

    :param pub: Publication whose ``_citations`` list is filled in place.
    :param num_citations: number of citations to collect (int or int-like str).
    :param soup: parsed BeautifulSoup document of the article page.
    """
    details = soup.find('ol', class_ = 'cited-content_cbyCitation')
    titles = [t.text.replace('.', '')
              for t in details.find_all(
                  'span', class_ = 'cited-content_cbyCitation_article-title')]
    journal_names = [n.text
                     for n in details.find_all(
                         'span', class_ = 'cited-content_cbyCitation_journal-name')]
    # Every anchor in the list; entries without an href yield None.
    doi_urls = [a.get('href') for a in details.find_all('a')]
    contributors = [c.text
                    for c in details.find_all(
                        'span', class_ = 'cited-content_cbyCitation_article-contributors')]
    # BUGFIX: clamp to the data actually parsed — the advertised citation
    # count can exceed the number of entries extracted from the page, which
    # previously raised IndexError.
    count = min(int(num_citations), len(titles), len(journal_names),
                len(contributors), len(doi_urls))
    for i in range(count):
        pub._citations.append(Citation(titles[i], journal_names[i],
                                       contributors[i], doi_urls[i]))


def print_pub_info(pub):
    """Pretty-print a Publication and its collected citations to stdout."""
    print(f'''Article title: {pub.title}
Publication date: {pub.publication_date}
DOI-URL: {pub.doi_url}

Subjects:''')
    print(*(pub.subjects), sep = ", ")
    print('\nContributors:')
    print(*(pub.contributors), sep = ", ")
    print(f'\nThis article is cited by the following {pub.num_citations} publications\n')
    for citation in pub._citations:
        print(f'''
    Title: {citation.title}
    Journal: {citation.journal}
    Contributors: {citation.contributors}
    DOI-URL: {citation.doi_url}
    ''')


def input(url):
    """Fetch the article page at *url* and return a populated Publication.

    NOTE(review): this function shadows the builtin ``input()``; the name is
    kept because the script entry point calls it, but consider renaming to
    something like ``scrape_publication`` in a follow-up.
    """
    html_text = req.get(url).text
    soup = bs(html_text, 'html.parser')
    pub = get_article_info(soup)
    get_citation_info(pub, int(pub.num_citations), soup)
    return pub
# Script entry point: scrape the article page given on the command line and
# print its metadata plus the publications that cite it.
if __name__ == '__main__':
    # Guarding with __main__ lets the module be imported (e.g. for testing)
    # without triggering the argv check or network access.
    if len(sys.argv) != 2:
        sys.stderr.write('Usage: {} <url>\n'.format(sys.argv[0]))
        # BUGFIX: sys.exit() instead of the site-module helper exit(),
        # which is not guaranteed to exist when run without the site module.
        sys.exit(1)
    url = sys.argv[1]
    pub = input(url)  # module-level 'input' (the scraper), not the builtin
    print_pub_info(pub)