diff --git a/example_input.py b/example_input.py index a8331cb93912fb21a1d184ab97beda8996413c74..febbc7b05ced9c093adc0fe413fbd2d5e3094e1e 100755 --- a/example_input.py +++ b/example_input.py @@ -1,14 +1,10 @@ #!/usr/bin/env python3 -from input.interface import InputInterface -import input.publication +from input.interface import InputInterface as Input def main(url: str): - #print(get_publication(url)) - print(InputInterface.get_publication(url)) - #pub.print_pub() + print(Input.get_publication(url)) if __name__ == "__main__": #main("https://doi.org/10.1021/acs.jcim.1c00203") - #main("https://doi.org/10.1021/acs.jcim.1c00917") - main("https://doi.org/10.1021/acs.jcim.5b00332") + main("https://pubs.acs.org/doi/10.1021/acs.jcim.5b00332") diff --git a/input/.publication.py.swp b/input/.publication.py.swp deleted file mode 100644 index d2a0a137478b10579a081fbe5111873dda6877d0..0000000000000000000000000000000000000000 Binary files a/input/.publication.py.swp and /dev/null differ diff --git a/input/README.md b/input/README.md index 0ebd7e1c0a72a1cd23caf15f227e2b8c186a75b4..7776ee0c3e01c24b0fc99c9025783dbc1d02c563 100644 --- a/input/README.md +++ b/input/README.md @@ -5,12 +5,12 @@ Input-Package to fetch publication information with a given url. ## Usage/Examples ```python -from input.interface import get_publication +from input.interface import InputInterface as Input from input.publication import Publication def main(url): try: - pub = get_publication(url) + pub = Input.get_publication(url) except Exception as error: raise error @@ -21,6 +21,14 @@ def main(url): if __name__=="__main__": main("https://doi.org/10.1021/acs.chemrev.8b00728") ``` + +## Testing + +``` c +python -m unittest input/test/<file.py> -v +# for all tests in directory +python -m unittest discover input/test -v +``` ## Authors - Florian Jochens - Sam Ockenden diff --git a/input/get/acs.py b/input/get/acs.py index 3a54a1512944196285651315fc47a821a463e2d2..cfc1d6db0372c4267fc971a4f3dd0064ea5a63fa 100755 --- a/input/get/acs.py +++ b/input/get/acs.py @@ -21,13 +21,16 @@ class Fetcher(JournalFetcher): SUPPORTED_JOURNALS = ['1021'] @staticmethod - def can_use_url(url: str) -> bool: + def can_use_url(url: str) -> str: """ Uses Regex to extract journal specific substrings in Doi. TODO: Support non Doi-urls """ - matched_url = re.match(r'(https?://)?(doi.org/)?(10.(\d{4})/\w+.\S+)', url) - return matched_url[4] in Fetcher.SUPPORTED_JOURNALS + matched_url = re.match(r'^(https?://)?(doi.org/|pubs.acs.org/doi/)?(10.(\d{4})/\w+.\S+)', url.strip(". \t\r\n")) + if matched_url is not None: + return matched_url[4] in Fetcher.SUPPORTED_JOURNALS + else: + return False @staticmethod def get_publication(url: str) -> Publication: @@ -65,16 +68,19 @@ class Fetcher(JournalFetcher): references_soup = ref_cit_soup.select('ol#references') if references_soup != []: for reference in references_soup[0].select('li'): - ref_doi = "https://doi.org/{}".format(reference.select('.refDoi')[0].text.strip()[5:])\ - if reference.select('.refDoi') != [] else "None" + if reference.select('.refDoi') != []: + ref_doi = "https://doi.org/{}".format(reference.select('.refDoi')[0].text.strip()[5:]) + else: + # No Doi -> No Paper + continue ref_title = reference.select('.NLM_article-title')[0].text\ - if reference.select('.NLM_article-title') != [] else "None" + if reference.select('.NLM_article-title') != [] else None ref_journal = reference.select('i')[0].text\ - if reference.select('i') != [] else "None" + if reference.select('i') != [] else None ref_contributors=[] for author in reference.select('.NLM_contrib-group'): - ref_contributors.append(author.text) + ref_contributors.append(author.text.replace("\n", " ").replace("\r", "")) references.append(Citation(ref_doi, ref_title, ref_journal, ref_contributors, cit_type="Reference")) @@ -82,14 +88,18 @@ class Fetcher(JournalFetcher): citation_soup = ref_cit_soup.select('.cited-content_cbyCitation') if citation_soup != []: for citation in citation_soup[0].select('li'): - cit_doi = citation.select('a[title="DOI URL"]')[0].text\ - if citation.select('a[title="DOI URL"]') != [] else "None" + if citation.select('a[title="DOI URL"]') != []: + cit_doi = citation.select('a[title="DOI URL"]')[0].text + else: + # No Doi -> No Paper + continue cit_title = citation.select('.cited-content_cbyCitation_article-title')[0].text\ if citation.select('.cited-content_cbyCitation_article-title')!= [] else "None" cit_journal = citation.select('.cited-content_cbyCitation_journal-name')[0].text\ if citation.select('.cited-content_cbyCitation_journal-name') != [] else "None" cit_contributors =[] - cit_contributors = citation.select('.cited-content_cbyCitation_article-contributors')[0].text.split(', ') + cit_contributors = citation.select('.cited-content_cbyCitation_article-contributors')[0]\ + .text.replace("\n", " ").replace("\r", "").split(', ') # clean up of the last Entry cit_contributors_last = cit_contributors.pop().strip(". ") if cit_contributors_last != '': @@ -98,8 +108,3 @@ class Fetcher(JournalFetcher): return Publication(doi_url, title, contributors, journal, published , subjects, num_citations, references, citations) - - - @staticmethod - def test_fetcher(): - pass diff --git a/input/get/acs_fj.py b/input/get/acs_fj.py deleted file mode 100755 index 28808382951156c919eecc4cfc827ea8059fe835..0000000000000000000000000000000000000000 --- a/input/get/acs_fj.py +++ /dev/null @@ -1,100 +0,0 @@ -#!/usr/bin/env python3 - -""" -Child class of JournalFetcher -JCIM -""" - -# import re -from input.get.journal_fetcher import JournalFetcher -from input.publication import Publication, Citation, Reference -import requests as req -from bs4 import BeautifulSoup as bs - -class Fetcher(JournalFetcher): - - """ - """ - - # TODO: Naming-Convention: - # Class: 'Fetcher' - # file: input_get_[journal-/organisation-name] - # format = "input_get_[a-z]*.py" allowed - # TODO: List of Compatable Journals - _SUPPORTED_JOURNALS = [] - - @staticmethod - def can_use_url(url: str) -> bool: - """ - Checks if given url links to a supported journal. - """ - - # TODO: Check the URL for compatability - # re.match in _SUPPORTED_JOURNALS - return True - - @staticmethod - def get_publication(url: str) -> Publication: - return input(url) - - - @staticmethod - def test_fetcher(): - pass - -def get_article_info(soup): - header = soup.find('div', class_ = 'article_header-left pull-left') - article_title = header.find('span', class_ = 'hlFld-Title').text - publication_date = header.find('span', class_ = 'pub-date-value').text - for link in header.find('div', class_ = 'article_header-doiurl'): - doi_url = link.get('href') - subs = header.find('div', class_ = 'article_header-taxonomy') - subjects = [] - for sub in subs.find_all('a'): - subjects.append(sub.get('title')) - cons = header.find('ul', class_ = 'loa') - contributors = [] - for con in cons.find_all('span', class_ = 'hlFld-ContribAuthor'): - contributors.append(con.text) - numc = header.find('div', class_ = 'articleMetrics_count') - if not numc.a: - num_citations = 0 - else: - num_citations = numc.a.text - pub = Publication(doi_url, article_title, contributors, "JCIM", - publication_date, subjects, num_citations) - #pub = Publication(article_title, publication_date, contributors, doi_url, - # subjects, num_citations) - return pub - - -def get_citation_info(pub, num_citations, soup): - details = soup.find('ol', class_ = 'cited-content_cbyCitation') - titles = [] - for title in details.find_all('span', - class_ = 'cited-content_cbyCitation_article-title'): - titles.append(title.text.replace('.', '')) - journal_names = [] - for name in details.find_all('span', - class_ = 'cited-content_cbyCitation_journal-name'): - journal_names.append(name.text) - doi_urls = [] - for url in details.find_all('a'): - doi_urls.append(url.get('href')) - # TODO: There are a few diffrent types how Contributors are listed - contributors = [] - for contrib in details.find_all('span', - class_ = 'cited-content_cbyCitation_article-contributors'): - contributors.append(contrib.text) - for i in range(0, int(num_citations)): - pub.citations.append(Citation(doi_urls[i], titles[i], journal_names[i], \ - contributors[i])) - -def input(url): - html_text = req.get(url).text - soup = bs(html_text, 'html.parser') - - pub = get_article_info(soup) - if int(pub.num_citations) > 0: - get_citation_info(pub, int(pub.num_citations), soup) - return pub diff --git a/input/get/journal_fetcher.py b/input/get/journal_fetcher.py index 097eb2456676bd06ab6ca697c80eed1b7c03c475..aad5857016a25c95b572ffdd32dfd69c67d67388 100755 --- a/input/get/journal_fetcher.py +++ b/input/get/journal_fetcher.py @@ -14,6 +14,7 @@ class JournalFetcher(metaclass=ABCMeta): """ This is a abstract-class for fetcher modules """ + @staticmethod def get_soup(url: str) -> BeautifulSoup: """ @@ -31,7 +32,8 @@ class JournalFetcher(metaclass=ABCMeta): raise SystemExit(err) return BeautifulSoup(req.content, 'html.parser') - + + @staticmethod @abstractmethod def can_use_url(url: str) -> bool: @@ -41,6 +43,7 @@ class JournalFetcher(metaclass=ABCMeta): """ raise AttributeError("JournalFetcher for '{}' hasnt implemented 'can_use_url()'".format(url)) + @staticmethod @abstractmethod def get_publication(url: str) -> Publication: @@ -49,12 +52,3 @@ class JournalFetcher(metaclass=ABCMeta): Creates a Publication-instance. """ raise AttributeError("JournalFetcher for '{}' hasnt implemented 'get_publication()'".format(url)) - - @staticmethod - @abstractmethod - def test_fetcher(): - """ - Abstract-function to be implemented in subclass. - Unit-test for the class. - """ - raise AttributeError("JournalFetcher: Subclass hasnt implemented 'test_fetcher()'") diff --git a/input/get/nature.py b/input/get/nature.py old mode 100755 new mode 100644 index d08d74f636687eb8510ad51167a9c9380272ed18..4d206f426d06575e1e7829ec8ec42631b3fe6493 --- a/input/get/nature.py +++ b/input/get/nature.py @@ -57,7 +57,3 @@ class Fetcher(JournalFetcher): # TODO: Exceptions-handling # raise ValueException("Cant Fetch: '{}'".format(error)) # return None - - @staticmethod - def test_fetcher(): - pass diff --git a/input/get/template_.py b/input/get/template_.py index 72f3cf913f8d9165d09fd897d0bb68ba9b30183a..da200bcfff5ea91445d476852f3763d97541dec9 100755 --- a/input/get/template_.py +++ b/input/get/template_.py @@ -31,7 +31,9 @@ class Fetcher(JournalFetcher): # TODO: Check the URL for compatability # url_re = re.match(r'(https?://)?(doi.org/)?(10.(\d{4})/\w+.\S+)', url) - # return url_re[4] in SUPPORTED_JOURNALS + # if url_re is not None: + # return url_re[4] in SUPPORTED_JOURNALS + # else: return False @staticmethod @@ -45,8 +47,4 @@ class Fetcher(JournalFetcher): # doi,title,contributors[],journal,publication_date,subjects[],references[],citations[] # TODO: Create new Publication-instance # return Publication(doi,title,contributors[],journal,publication_date,subjects[],num_citation=None ,references[],citations[]) - return None - - @staticmethod - def test_fetcher(): - pass + return None \ No newline at end of file diff --git a/input/publication.py b/input/publication.py index f819bedea7d1ad944e34d915dfc0027a21054cc8..6de23733972a02ced3fe41d015e52b39c4f404f3 100755 --- a/input/publication.py +++ b/input/publication.py @@ -4,10 +4,10 @@ class Publication: """ Represents a Publications """ - def __init__(self, doi_url: str, title: str - , contributors: str, journal: str - , publication_date: str, subjects = None, num_citations = None - , references = None, citations = None ): + def __init__(self, doi_url: str, title: str \ + , contributors: list[str], journal: str \ + , publication_date: str, subjects: list[str], num_citations: int = None \ + , references: list[any] = None, citations: list[any] = None ): """ Parameters ---------- @@ -72,7 +72,7 @@ class Publication: citation_string.append(citation.__str__()) return citation_string - def citations(self, citation) -> None: + def add_citations(self, citation) -> None: """ Appends a list of Citations or Citation to self.citations. @@ -132,9 +132,9 @@ Subjects:''') class Citation: - def __init__(self, doi_url: str, title: str - , journal: str, contributors = None - , cit_type = "Citation"): + def __init__(self, doi_url: str, title: str \ + , journal: str, contributors: list[str] \ + , cit_type: str = "Citation"): """ Parameters ---------- @@ -168,7 +168,7 @@ class Citation: # This is just a replica of Citations class Reference: - def __init__(self, doi_url, title, journal, contributors): + def __init__(self, doi_url: str, title: str, journal: str, contributors: list[str]): self.title = title self.doi_url = doi_url self.journal = journal diff --git a/input/tempdir/input_fj.py b/input/tempdir/input_fj.py deleted file mode 100755 index 00bb0126e2ae1abf6563bf99a16cc585b6d88077..0000000000000000000000000000000000000000 --- a/input/tempdir/input_fj.py +++ /dev/null @@ -1,144 +0,0 @@ -#!/usr/bin/env python3 -""" -Functions for information retrieval of articles from the ACS journal JCIM - -""" - -__author__ = "Florian Jochens" -__email__ = "fj@andaco.de" -__status__ = "Production" -#__copyright__ = "" -#__credits__ = ["", "", "", ""] -#__license__ = "" -#__version__ = "" -#__maintainer__ = "" - -from bs4 import BeautifulSoup as bs -import requests as req -import sys -from pathlib import Path - -class Publication: - #_registry = [] - _citations = [] - - def __init__(self, title, publication_date, contributors, doi_url, - subjects, num_citations): - #self._registry.append(self) - self.title = title - self.publication_date = publication_date - self.contributors = contributors - self.doi_url = doi_url - self.subjects = subjects - self.num_citations = num_citations - -class Citation: - def __init__(self, title, journal, contributors, doi_url): - self.title = title - self.journal = journal - self.contributors = contributors - self.doi_url = doi_url - -def get_article_info(soup): - header = soup.find('div', class_ = 'article_header-left pull-left') - article_title = header.find('span', class_ = 'hlFld-Title').text - publication_date = header.find('span', class_ = 'pub-date-value').text - for link in header.find('div', class_ = 'article_header-doiurl'): - doi_url = link.get('href') - subs = header.find('div', class_ = 'article_header-taxonomy') - subjects = [] - for sub in subs.find_all('a'): - subjects.append(sub.get('title')) - cons = header.find('ul', class_ = 'loa') - contributors = [] - for con in cons.find_all('span', class_ = 'hlFld-ContribAuthor'): - contributors.append(con.text) - numc = header.find('div', class_ = 'articleMetrics_count') - if not numc.a: - num_citations = 0 - else: - num_citations = numc.a.text - - pub = Publication(article_title, publication_date, contributors, doi_url, - subjects, num_citations) - return pub - -def get_download_url(): - export = soup.find('div', class_ = 'cit-download-dropdown_content') - url = 'https://pubs.acs.org' - for link in export.find_all('a'): - if link.get('title') == 'Citation and references': - url += link.get('href') - print(url) - return url - -def download(url): # Download citation and references file - if url.find('='): - filename = url.rsplit('=', 1)[1] - path = Path(('./files/' + filename)) - if path.is_file(): - print("File already exists") - else: - print("File does not exist") - -def get_citation_info(pub, num_citations, soup): - pub._citations = [] - details = soup.find('ol', class_ = 'cited-content_cbyCitation') - titles = [] - for title in details.find_all('span', - class_ = 'cited-content_cbyCitation_article-title'): - titles.append(title.text.replace('.', '')) - journal_names = [] - for name in details.find_all('span', - class_ = 'cited-content_cbyCitation_journal-name'): - journal_names.append(name.text) - doi_urls = [] - for url in details.find_all('a'): - doi_urls.append(url.get('href')) - contributors = [] - for contrib in details.find_all('span', - class_ = 'cited-content_cbyCitation_article-contributors'): - contributors.append(contrib.text) - for i in range(0, int(num_citations)): - pub._citations.append(Citation(titles[i], journal_names[i], - contributors[i], doi_urls[i])) -def print_pub_info(pub): - print(f'''Article title: {pub.title} -Publication date: {pub.publication_date} -DOI-URL: {pub.doi_url} - -Subjects:''') - print(*(pub.subjects), sep = ", ") - print('\nContributors:') - print(*(pub.contributors), sep = ", ") - - if int(pub.num_citations) > 0: - if int(pub.num_citations) == 1: - print(f'\nThis publication is cited by the following publication:\n') - else: - print(f'\nThis publication is cited by the following {pub.num_citations} publications:\n') - for citation in pub._citations: - print(f''' - Title: {citation.title} - Journal: {citation.journal} - Contributors: {citation.contributors} - DOI-URL: {citation.doi_url} - ''') - else: - print('\nThis publication is not cited by any other publication.') - -def input(url): - html_text = req.get(url).text - soup = bs(html_text, 'html.parser') - - pub = get_article_info(soup) - if int(pub.num_citations) > 0: - get_citation_info(pub, int(pub.num_citations), soup) - return pub - -#if len(sys.argv) != 2: -# sys.stderr.write('Usage: {} <url>\n'.format(sys.argv[0])) -# exit(1) -#url = sys.argv[1] -#pub = input(url) -#print_pub_info(pub) diff --git a/input/tempdir/pub.py b/input/tempdir/pub.py deleted file mode 100644 index 13b90e804cd485813b731385b319b3077a017dd2..0000000000000000000000000000000000000000 --- a/input/tempdir/pub.py +++ /dev/null @@ -1,32 +0,0 @@ -class Publication: - #_registry = [] - #_citations = [] - #_references = [] - - def __init__(self, title, publication_date, contributors, doi_url, - subjects, num_citations): - #self._registry.append(self) - self.title = title - self.publication_date = publication_date - self.contributors = contributors - self.doi_url = doi_url - self.subjects = subjects - self.num_citations = num_citations - self.num_references = num_references - self._citations = [] - self._references = [] - -class Citation: - def __init__(self, title, journal, contributors, doi_url): - self.title = title - self.journal = journal - self.contributors = contributors - self.doi_url = doi_url - -class References: - def __init__(self, title, journal, contributors, doi_url): - self.title = title - self.journal = journal - self.contributors = contributors - self.doi_url = doi_url - diff --git a/input/tempdir/test.py b/input/tempdir/test.py deleted file mode 100755 index bdd12e849ed5a239cadf5f8180d319a114512f9f..0000000000000000000000000000000000000000 --- a/input/tempdir/test.py +++ /dev/null @@ -1,15 +0,0 @@ -#!/usr/bin/env python3 - -from input_fj import input, print_pub_info -import sys - -if len(sys.argv) != 3: - sys.stderr.write('Usage: {} <url>\n'.format(sys.argv[0])) - exit(1) -url = sys.argv[1] -url2 = sys.argv[2] -pub = input(url) -print_pub_info(pub) -pub2 = input(url2) -print_pub_info(pub2) - diff --git a/input/test_input_get_publication.py b/input/test_input_get_publication.py deleted file mode 100755 index 941dbc76b5fb190ebf906ebfea8b60fbbbdd5d40..0000000000000000000000000000000000000000 --- a/input/test_input_get_publication.py +++ /dev/null @@ -1,28 +0,0 @@ -import unittest -""" -Testing the Publication fetcher - -Publication 1: 'https://doi.org/10.1021/acs.jcim.1c00203' -Publication 2: 'doi.org/10.1021/acs.jcim.1c00917' -Publication 3: '10.1038/nchem.1781' -Publication 4: '11.12/jaj' -Publication 5: '11.12/' -Publication 6: 'https://doi.org/10.1021/acs.jmedchem.0c01332' # Paper is a PDF -""" - - -class TestGetPublication(unittest.TestCase): - - def test_publication1(self): - pass - - def test_publication2(self): - pass - - def test_publication3(self): - pass - - -if __name__=="__main__": - print("test") - unittest.main() \ No newline at end of file