Skip to content
Snippets Groups Projects
Commit 35b2ab08 authored by Jochens, Florian's avatar Jochens, Florian
Browse files

updated version of input_fj.py

parent 0325babd
No related branches found
No related tags found
1 merge request!1input
...@@ -15,43 +15,48 @@ __status__ = "Production" ...@@ -15,43 +15,48 @@ __status__ = "Production"
from bs4 import BeautifulSoup as bs from bs4 import BeautifulSoup as bs
import requests as req import requests as req
import sys import sys
from pathlib import Path from pathlib import Path
class Publication:
    """Metadata for one journal article plus the works that cite it.

    Attributes:
        title: Article title text.
        publication_date: Publication date string as scraped from the page.
        contributors: List of author name strings.
        doi_url: DOI link of the article (href string).
        subjects: List of subject/taxonomy title strings.
        num_citations: Citation count as scraped (a string; callers int() it).
        _citations: List of Citation objects, filled by get_citation_info().
    """

    def __init__(self, title, publication_date, contributors, doi_url,
                 subjects, num_citations):
        self.title = title
        self.publication_date = publication_date
        self.contributors = contributors
        self.doi_url = doi_url
        self.subjects = subjects
        self.num_citations = num_citations
        # Per-instance list. The previous class-level `_citations = []` was a
        # single list shared by *all* Publication objects, so citations of one
        # article leaked into every other instance.
        self._citations = []
class Citation:
    """One work that cites the main publication.

    Plain record type: title, journal name, contributor string, and DOI link
    are stored verbatim as scraped from the "cited by" list.
    """

    def __init__(self, title, journal, contributors, doi_url):
        # Assign all four descriptive fields in one pass.
        for field, value in (("title", title),
                             ("journal", journal),
                             ("contributors", contributors),
                             ("doi_url", doi_url)):
            setattr(self, field, value)
def get_article_info(soup):
    """Extract article metadata from a parsed ACS article page.

    Args:
        soup: BeautifulSoup tree of the article's HTML page.

    Returns:
        Publication populated with title, date, authors, DOI, subjects and
        the citation count (as the raw text string from the page).
    """
    header = soup.find('div', class_ = 'article_header-left pull-left')
    article_title = header.find('span', class_ = 'hlFld-Title').text
    publication_date = header.find('span', class_ = 'pub-date-value').text
    # Default to None so doi_url is always bound even when the DOI div
    # contains no child links (the original left it undefined in that case).
    doi_url = None
    for link in header.find('div', class_ = 'article_header-doiurl'):
        doi_url = link.get('href')
    subs = header.find('div', class_ = 'article_header-taxonomy')
    subjects = []
    # find() returns None when the taxonomy box is absent; guard so we
    # don't crash with AttributeError on pages without subjects.
    if subs is not None:
        for sub in subs.find_all('a'):
            subjects.append(sub.get('title'))
    cons = header.find('ul', class_ = 'loa')
    contributors = []
    for con in cons.find_all('span', class_ = 'hlFld-ContribAuthor'):
        contributors.append(con.text)
    num_citations = header.find('a', class_ = 'internalNav').text
    pub = Publication(article_title, publication_date, contributors, doi_url,
                      subjects, num_citations)
    return pub
def get_download_url(): def get_download_url():
...@@ -60,6 +65,7 @@ def get_download_url(): ...@@ -60,6 +65,7 @@ def get_download_url():
for link in export.find_all('a'): for link in export.find_all('a'):
if link.get('title') == 'Citation and references': if link.get('title') == 'Citation and references':
url += link.get('href') url += link.get('href')
print(url)
return url return url
def download(url): # Download citation and references file def download(url): # Download citation and references file
...@@ -70,27 +76,56 @@ def download(url): # Download citation and references file ...@@ -70,27 +76,56 @@ def download(url): # Download citation and references file
print("File already exists") print("File already exists")
else: else:
print("File does not exist") print("File does not exist")
def get_citation_info(pub, num_citations, soup):
    """Parse the "cited by" list and append Citation objects to pub._citations.

    Args:
        pub: Publication to attach the citations to.
        num_citations: Maximum number of citations to record (int).
        soup: BeautifulSoup tree of the article's HTML page.
    """
    details = soup.find('ol', class_ = 'cited-content_cbyCitation')
    titles = [title.text.replace('.', '')
              for title in details.find_all(
                  'span', class_ = 'cited-content_cbyCitation_article-title')]
    journal_names = [name.text
                     for name in details.find_all(
                         'span', class_ = 'cited-content_cbyCitation_journal-name')]
    # NOTE(review): this grabs *every* <a> in the list; presumably each entry
    # has exactly one link so positions line up — verify against the page.
    doi_urls = [url.get('href') for url in details.find_all('a')]
    contributors = [contrib.text
                    for contrib in details.find_all(
                        'span', class_ = 'cited-content_cbyCitation_article-contributors')]
    # zip stops at the shortest list, so a partially parsed entry can no
    # longer raise IndexError as the original index-based loop did when the
    # advertised count exceeded what was actually scraped.
    rows = list(zip(titles, journal_names, contributors, doi_urls))
    for title, journal, contrib, doi in rows[:int(num_citations)]:
        pub._citations.append(Citation(title, journal, contrib, doi))
def print_pub_info(pub):
print(f'''Article title: {pub.title}
Publication date: {pub.publication_date}
DOI-URL: {pub.doi_url}
Subjects:''')
print(*(pub.subjects), sep = ", ")
print('\nContributors:')
print(*(pub.contributors), sep = ", ")
print(f'\nThis article is cited by the following {pub.num_citations} publications\n')
for citation in pub._citations:
print(f'''
Title: {citation.title}
Journal: {citation.journal}
Contributors: {citation.contributors}
DOI-URL: {citation.doi_url}
''')
def input(url):
    """Fetch an article page and return a fully populated Publication.

    Downloads the HTML at `url`, parses it, extracts the article metadata,
    then scrapes the "cited by" entries into pub._citations.

    NOTE(review): this function shadows the builtin `input()`; consider
    renaming (kept because the script entry point calls it by this name).

    Args:
        url: URL of the article page to scrape.

    Returns:
        Publication with citations attached.
    """
    html_text = req.get(url).text
    soup = bs(html_text, 'html.parser')
    pub = get_article_info(soup)
    get_citation_info(pub, int(pub.num_citations), soup)
    return pub
def _main():
    """Command-line entry point: scrape one article URL and print its info."""
    if len(sys.argv) != 2:
        sys.stderr.write('Usage: {} <url>\n'.format(sys.argv[0]))
        # sys.exit is the reliable form; bare exit() is a site-module helper
        # intended for the interactive interpreter only.
        sys.exit(1)
    url = sys.argv[1]
    pub = input(url)
    print_pub_info(pub)


# Guard so importing this module no longer triggers a network request.
if __name__ == '__main__':
    _main()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment