Commit be58b180 authored by Florian Jochens

added newest version of the input files

parent 1ea2c94e
#!/usr/bin/env python3
-from input.interface import InputInterface
-import input.publication
+from input.interface import InputInterface as Input

def main(url: str):
-    #print(get_publication(url))
-    print(InputInterface.get_publication(url))
-    #pub.print_pub()
+    print(Input.get_publication(url))

if __name__ == "__main__":
    #main("https://doi.org/10.1021/acs.jcim.1c00203")
-    #main("https://doi.org/10.1021/acs.jcim.1c00917")
-    main("https://doi.org/10.1021/acs.jcim.5b00332")
+    main("https://pubs.acs.org/doi/10.1021/acs.jcim.5b00332")
File deleted
@@ -5,12 +5,12 @@ Input-Package to fetch publication information with a given url.
## Usage/Examples
```python
-from input.interface import get_publication
+from input.interface import InputInterface as Input
from input.publication import Publication

def main(url):
    try:
-        pub = get_publication(url)
+        pub = Input.get_publication(url)
    except Exception as error:
        raise error
@@ -21,6 +21,14 @@ def main(url):
if __name__=="__main__":
    main("https://doi.org/10.1021/acs.chemrev.8b00728")
```
+## Testing
+``` c
+python -m unittest input/test/<file.py> -v
+# for all tests in directory
+python -m unittest discover input/test -v
+```
## Authors
- Florian Jochens
- Sam Ockenden
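A possible companion to the new Testing section: a sketch of a test module that `python -m unittest discover input/test -v` would pick up. The file name, the module path input.get.acs, and the assertions are illustrative assumptions, not taken from this commit.

```python
# input/test/test_acs_url.py -- illustrative file name, not part of this commit.
import unittest
from input.get.acs import Fetcher  # assumed module path for the ACS fetcher


class TestCanUseUrl(unittest.TestCase):
    def test_acs_doi_is_accepted(self):
        # '1021' is listed in Fetcher.SUPPORTED_JOURNALS in this commit
        self.assertTrue(Fetcher.can_use_url("https://doi.org/10.1021/acs.jcim.5b00332"))

    def test_other_registrant_is_rejected(self):
        # '1038' (Nature) is not in the ACS fetcher's SUPPORTED_JOURNALS
        self.assertFalse(Fetcher.can_use_url("10.1038/nchem.1781"))


if __name__ == "__main__":
    unittest.main()
```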
@@ -21,13 +21,16 @@ class Fetcher(JournalFetcher):
    SUPPORTED_JOURNALS = ['1021']

    @staticmethod
-    def can_use_url(url: str) -> bool:
+    def can_use_url(url: str) -> str:
        """
        Uses Regex to extract journal specific substrings in Doi.
        TODO: Support non Doi-urls
        """
-        matched_url = re.match(r'(https?://)?(doi.org/)?(10.(\d{4})/\w+.\S+)', url)
-        return matched_url[4] in Fetcher.SUPPORTED_JOURNALS
+        matched_url = re.match(r'^(https?://)?(doi.org/|pubs.acs.org/doi/)?(10.(\d{4})/\w+.\S+)', url.strip(". \t\r\n"))
+        if matched_url is not None:
+            return matched_url[4] in Fetcher.SUPPORTED_JOURNALS
+        else:
+            return False

    @staticmethod
    def get_publication(url: str) -> Publication:
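To make the docstring above concrete ("Uses Regex to extract journal specific substrings in Doi"), here is a small standalone demo (not part of the commit) of what the updated pattern captures; group 4 is the DOI registrant that gets checked against SUPPORTED_JOURNALS.

```python
# Demo only: exercise the regex from the new can_use_url() on a few URL shapes.
import re

PATTERN = r'^(https?://)?(doi.org/|pubs.acs.org/doi/)?(10.(\d{4})/\w+.\S+)'

for url in ("https://doi.org/10.1021/acs.jcim.5b00332",
            "https://pubs.acs.org/doi/10.1021/acs.jcim.5b00332",
            "10.1038/nchem.1781",
            "not a doi"):
    match = re.match(PATTERN, url.strip(". \t\r\n"))
    print(url, "->", match[4] if match else None)
# Prints '1021', '1021', '1038', None; only '1021' is in SUPPORTED_JOURNALS.
```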
@@ -65,16 +68,19 @@ class Fetcher(JournalFetcher):
        references_soup = ref_cit_soup.select('ol#references')
        if references_soup != []:
            for reference in references_soup[0].select('li'):
-                ref_doi = "https://doi.org/{}".format(reference.select('.refDoi')[0].text.strip()[5:])\
-                    if reference.select('.refDoi') != [] else "None"
+                if reference.select('.refDoi') != []:
+                    ref_doi = "https://doi.org/{}".format(reference.select('.refDoi')[0].text.strip()[5:])
+                else:
+                    # No Doi -> No Paper
+                    continue
                ref_title = reference.select('.NLM_article-title')[0].text\
-                    if reference.select('.NLM_article-title') != [] else "None"
+                    if reference.select('.NLM_article-title') != [] else None
                ref_journal = reference.select('i')[0].text\
-                    if reference.select('i') != [] else "None"
+                    if reference.select('i') != [] else None
                ref_contributors=[]
                for author in reference.select('.NLM_contrib-group'):
-                    ref_contributors.append(author.text)
+                    ref_contributors.append(author.text.replace("\n", " ").replace("\r", ""))
                references.append(Citation(ref_doi, ref_title, ref_journal, ref_contributors, cit_type="Reference"))
@@ -82,14 +88,18 @@ class Fetcher(JournalFetcher):
        citation_soup = ref_cit_soup.select('.cited-content_cbyCitation')
        if citation_soup != []:
            for citation in citation_soup[0].select('li'):
-                cit_doi = citation.select('a[title="DOI URL"]')[0].text\
-                    if citation.select('a[title="DOI URL"]') != [] else "None"
+                if citation.select('a[title="DOI URL"]') != []:
+                    cit_doi = citation.select('a[title="DOI URL"]')[0].text
+                else:
+                    # No Doi -> No Paper
+                    continue
                cit_title = citation.select('.cited-content_cbyCitation_article-title')[0].text\
                    if citation.select('.cited-content_cbyCitation_article-title')!= [] else "None"
                cit_journal = citation.select('.cited-content_cbyCitation_journal-name')[0].text\
                    if citation.select('.cited-content_cbyCitation_journal-name') != [] else "None"
                cit_contributors =[]
-                cit_contributors = citation.select('.cited-content_cbyCitation_article-contributors')[0].text.split(', ')
+                cit_contributors = citation.select('.cited-content_cbyCitation_article-contributors')[0]\
+                    .text.replace("\n", " ").replace("\r", "").split(', ')
                # clean up of the last Entry
                cit_contributors_last = cit_contributors.pop().strip(". ")
                if cit_contributors_last != '':
@@ -98,8 +108,3 @@ class Fetcher(JournalFetcher):
        return Publication(doi_url, title, contributors, journal, published
                           , subjects, num_citations, references, citations)
-
-    @staticmethod
-    def test_fetcher():
-        pass
#!/usr/bin/env python3
"""
Child class of JournalFetcher
JCIM
"""
# import re
from input.get.journal_fetcher import JournalFetcher
from input.publication import Publication, Citation, Reference
import requests as req
from bs4 import BeautifulSoup as bs
class Fetcher(JournalFetcher):
"""
"""
# TODO: Naming-Convention:
# Class: 'Fetcher'
# file: input_get_[journal-/organisation-name]
# format = "input_get_[a-z]*.py" allowed
# TODO: List of Compatable Journals
_SUPPORTED_JOURNALS = []
@staticmethod
def can_use_url(url: str) -> bool:
"""
Checks if given url links to a supported journal.
"""
# TODO: Check the URL for compatability
# re.match in _SUPPORTED_JOURNALS
return True
@staticmethod
def get_publication(url: str) -> Publication:
return input(url)
@staticmethod
def test_fetcher():
pass
def get_article_info(soup):
header = soup.find('div', class_ = 'article_header-left pull-left')
article_title = header.find('span', class_ = 'hlFld-Title').text
publication_date = header.find('span', class_ = 'pub-date-value').text
for link in header.find('div', class_ = 'article_header-doiurl'):
doi_url = link.get('href')
subs = header.find('div', class_ = 'article_header-taxonomy')
subjects = []
for sub in subs.find_all('a'):
subjects.append(sub.get('title'))
cons = header.find('ul', class_ = 'loa')
contributors = []
for con in cons.find_all('span', class_ = 'hlFld-ContribAuthor'):
contributors.append(con.text)
numc = header.find('div', class_ = 'articleMetrics_count')
if not numc.a:
num_citations = 0
else:
num_citations = numc.a.text
pub = Publication(doi_url, article_title, contributors, "JCIM",
publication_date, subjects, num_citations)
#pub = Publication(article_title, publication_date, contributors, doi_url,
# subjects, num_citations)
return pub
def get_citation_info(pub, num_citations, soup):
details = soup.find('ol', class_ = 'cited-content_cbyCitation')
titles = []
for title in details.find_all('span',
class_ = 'cited-content_cbyCitation_article-title'):
titles.append(title.text.replace('.', ''))
journal_names = []
for name in details.find_all('span',
class_ = 'cited-content_cbyCitation_journal-name'):
journal_names.append(name.text)
doi_urls = []
for url in details.find_all('a'):
doi_urls.append(url.get('href'))
# TODO: There are a few diffrent types how Contributors are listed
contributors = []
for contrib in details.find_all('span',
class_ = 'cited-content_cbyCitation_article-contributors'):
contributors.append(contrib.text)
for i in range(0, int(num_citations)):
pub.citations.append(Citation(doi_urls[i], titles[i], journal_names[i], \
contributors[i]))
def input(url):
html_text = req.get(url).text
soup = bs(html_text, 'html.parser')
pub = get_article_info(soup)
if int(pub.num_citations) > 0:
get_citation_info(pub, int(pub.num_citations), soup)
return pub
@@ -14,6 +14,7 @@ class JournalFetcher(metaclass=ABCMeta):
    """
    This is a abstract-class for fetcher modules
    """
+
    @staticmethod
    def get_soup(url: str) -> BeautifulSoup:
        """
@@ -32,6 +33,7 @@ class JournalFetcher(metaclass=ABCMeta):
        return BeautifulSoup(req.content, 'html.parser')
+
    @staticmethod
    @abstractmethod
    def can_use_url(url: str) -> bool:
@@ -41,6 +43,7 @@ class JournalFetcher(metaclass=ABCMeta):
        """
        raise AttributeError("JournalFetcher for '{}' hasnt implemented 'can_use_url()'".format(url))
+
    @staticmethod
    @abstractmethod
    def get_publication(url: str) -> Publication:
@@ -49,12 +52,3 @@ class JournalFetcher(metaclass=ABCMeta):
        Creates a Publication-instance.
        """
        raise AttributeError("JournalFetcher for '{}' hasnt implemented 'get_publication()'".format(url))
-
-    @staticmethod
-    @abstractmethod
-    def test_fetcher():
-        """
-        Abstract-function to be implemented in subclass.
-        Unit-test for the class.
-        """
-        raise AttributeError("JournalFetcher: Subclass hasnt implemented 'test_fetcher()'")
@@ -57,7 +57,3 @@ class Fetcher(JournalFetcher):
        # TODO: Exceptions-handling
        # raise ValueException("Cant Fetch: '{}'".format(error))
        # return None
-
-    @staticmethod
-    def test_fetcher():
-        pass
@@ -31,7 +31,9 @@ class Fetcher(JournalFetcher):
        # TODO: Check the URL for compatability
        # url_re = re.match(r'(https?://)?(doi.org/)?(10.(\d{4})/\w+.\S+)', url)
+        # if url_re is not None:
        # return url_re[4] in SUPPORTED_JOURNALS
+        # else:
        return False

    @staticmethod
@@ -46,7 +48,3 @@ class Fetcher(JournalFetcher):
        # TODO: Create new Publication-instance
        # return Publication(doi,title,contributors[],journal,publication_date,subjects[],num_citation=None ,references[],citations[])
        return None
-
-    @staticmethod
-    def test_fetcher():
-        pass
\ No newline at end of file
@@ -4,10 +4,10 @@ class Publication:
    """
    Represents a Publications
    """
-    def __init__(self, doi_url: str, title: str
-                 , contributors: str, journal: str
-                 , publication_date: str, subjects = None, num_citations = None
-                 , references = None, citations = None ):
+    def __init__(self, doi_url: str, title: str \
+                 , contributors: list[str], journal: str \
+                 , publication_date: str, subjects: list[str], num_citations: int = None \
+                 , references: list[any] = None, citations: list[any] = None ):
        """
        Parameters
        ----------
@@ -72,7 +72,7 @@ class Publication:
            citation_string.append(citation.__str__())
        return citation_string
-    def citations(self, citation) -> None:
+    def add_citations(self, citation) -> None:
        """
        Appends a list of Citations or Citation to self.citations.
@@ -132,9 +132,9 @@ Subjects:''')
class Citation:
-    def __init__(self, doi_url: str, title: str
-                 , journal: str, contributors = None
-                 , cit_type = "Citation"):
+    def __init__(self, doi_url: str, title: str \
+                 , journal: str, contributors: list[str] \
+                 , cit_type: str = "Citation"):
        """
        Parameters
        ----------
@@ -168,7 +168,7 @@ class Citation:
# This is just a replica of Citations
class Reference:
-    def __init__(self, doi_url, title, journal, contributors):
+    def __init__(self, doi_url: str, title: str, journal: str, contributors: list[str]):
        self.title = title
        self.doi_url = doi_url
        self.journal = journal
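To illustrate the newly annotated constructors, a made-up example (not from the commit; attribute access assumes the constructor stores its arguments under the same names):

```python
# Made-up example data, only to show the Publication/Citation signatures above.
from input.publication import Publication, Citation

cit = Citation("https://doi.org/10.1000/example.1", "An example citing paper",
               "Some Journal", ["A. Author", "B. Author"])

pub = Publication("https://doi.org/10.1021/acs.jcim.5b00332", "An example title",
                  ["C. Author"], "JCIM", "January 1, 2015",
                  ["Example Subject"], num_citations=1, citations=[cit])

print(pub.doi_url, len(pub.citations))
```

Further Citation objects would be appended through the renamed add_citations() method rather than the old citations().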
#!/usr/bin/env python3
"""
Functions for information retrieval of articles from the ACS journal JCIM
"""
__author__ = "Florian Jochens"
__email__ = "fj@andaco.de"
__status__ = "Production"
#__copyright__ = ""
#__credits__ = ["", "", "", ""]
#__license__ = ""
#__version__ = ""
#__maintainer__ = ""
from bs4 import BeautifulSoup as bs
import requests as req
import sys
from pathlib import Path
class Publication:
#_registry = []
_citations = []
def __init__(self, title, publication_date, contributors, doi_url,
subjects, num_citations):
#self._registry.append(self)
self.title = title
self.publication_date = publication_date
self.contributors = contributors
self.doi_url = doi_url
self.subjects = subjects
self.num_citations = num_citations
class Citation:
def __init__(self, title, journal, contributors, doi_url):
self.title = title
self.journal = journal
self.contributors = contributors
self.doi_url = doi_url
def get_article_info(soup):
header = soup.find('div', class_ = 'article_header-left pull-left')
article_title = header.find('span', class_ = 'hlFld-Title').text
publication_date = header.find('span', class_ = 'pub-date-value').text
for link in header.find('div', class_ = 'article_header-doiurl'):
doi_url = link.get('href')
subs = header.find('div', class_ = 'article_header-taxonomy')
subjects = []
for sub in subs.find_all('a'):
subjects.append(sub.get('title'))
cons = header.find('ul', class_ = 'loa')
contributors = []
for con in cons.find_all('span', class_ = 'hlFld-ContribAuthor'):
contributors.append(con.text)
numc = header.find('div', class_ = 'articleMetrics_count')
if not numc.a:
num_citations = 0
else:
num_citations = numc.a.text
pub = Publication(article_title, publication_date, contributors, doi_url,
subjects, num_citations)
return pub
def get_download_url():
export = soup.find('div', class_ = 'cit-download-dropdown_content')
url = 'https://pubs.acs.org'
for link in export.find_all('a'):
if link.get('title') == 'Citation and references':
url += link.get('href')
print(url)
return url
def download(url): # Download citation and references file
if url.find('='):
filename = url.rsplit('=', 1)[1]
path = Path(('./files/' + filename))
if path.is_file():
print("File already exists")
else:
print("File does not exist")
def get_citation_info(pub, num_citations, soup):
pub._citations = []
details = soup.find('ol', class_ = 'cited-content_cbyCitation')
titles = []
for title in details.find_all('span',
class_ = 'cited-content_cbyCitation_article-title'):
titles.append(title.text.replace('.', ''))
journal_names = []
for name in details.find_all('span',
class_ = 'cited-content_cbyCitation_journal-name'):
journal_names.append(name.text)
doi_urls = []
for url in details.find_all('a'):
doi_urls.append(url.get('href'))
contributors = []
for contrib in details.find_all('span',
class_ = 'cited-content_cbyCitation_article-contributors'):
contributors.append(contrib.text)
for i in range(0, int(num_citations)):
pub._citations.append(Citation(titles[i], journal_names[i],
contributors[i], doi_urls[i]))
def print_pub_info(pub):
print(f'''Article title: {pub.title}
Publication date: {pub.publication_date}
DOI-URL: {pub.doi_url}
Subjects:''')
print(*(pub.subjects), sep = ", ")
print('\nContributors:')
print(*(pub.contributors), sep = ", ")
if int(pub.num_citations) > 0:
if int(pub.num_citations) == 1:
print(f'\nThis publication is cited by the following publication:\n')
else:
print(f'\nThis publication is cited by the following {pub.num_citations} publications:\n')
for citation in pub._citations:
print(f'''
Title: {citation.title}
Journal: {citation.journal}
Contributors: {citation.contributors}
DOI-URL: {citation.doi_url}
''')
else:
print('\nThis publication is not cited by any other publication.')
def input(url):
html_text = req.get(url).text
soup = bs(html_text, 'html.parser')
pub = get_article_info(soup)
if int(pub.num_citations) > 0:
get_citation_info(pub, int(pub.num_citations), soup)
return pub
#if len(sys.argv) != 2:
# sys.stderr.write('Usage: {} <url>\n'.format(sys.argv[0]))
# exit(1)
#url = sys.argv[1]
#pub = input(url)
#print_pub_info(pub)
class Publication:
#_registry = []
#_citations = []
#_references = []
def __init__(self, title, publication_date, contributors, doi_url,
subjects, num_citations):
#self._registry.append(self)
self.title = title
self.publication_date = publication_date
self.contributors = contributors
self.doi_url = doi_url
self.subjects = subjects
self.num_citations = num_citations
self.num_references = num_references
self._citations = []
self._references = []
class Citation:
def __init__(self, title, journal, contributors, doi_url):
self.title = title
self.journal = journal
self.contributors = contributors
self.doi_url = doi_url
class References:
def __init__(self, title, journal, contributors, doi_url):
self.title = title
self.journal = journal
self.contributors = contributors
self.doi_url = doi_url
#!/usr/bin/env python3
from input_fj import input, print_pub_info
import sys
if len(sys.argv) != 3:
sys.stderr.write('Usage: {} <url>\n'.format(sys.argv[0]))
exit(1)
url = sys.argv[1]
url2 = sys.argv[2]
pub = input(url)
print_pub_info(pub)
pub2 = input(url2)
print_pub_info(pub2)
import unittest
"""
Testing the Publication fetcher
Publication 1: 'https://doi.org/10.1021/acs.jcim.1c00203'
Publication 2: 'doi.org/10.1021/acs.jcim.1c00917'
Publication 3: '10.1038/nchem.1781'
Publication 4: '11.12/jaj'
Publication 5: '11.12/'
Publication 6: 'https://doi.org/10.1021/acs.jmedchem.0c01332' # Paper is a PDF
"""
class TestGetPublication(unittest.TestCase):
def test_publication1(self):
pass
def test_publication2(self):
pass
def test_publication3(self):
pass
if __name__=="__main__":
print("test")
unittest.main()
\ No newline at end of file