#!/usr/bin/env python3
"""Example driver: fetch a publication via the input package and print it."""

from input.interface import InputInterface
import input.publication


def main(url: str):
    """Fetch the publication behind *url* and print its string form."""
    publication = InputInterface.get_publication(url)
    print(publication)


if __name__ == "__main__":
    # Other sample DOIs:
    #   https://doi.org/10.1021/acs.jcim.1c00203
    #   https://doi.org/10.1021/acs.jcim.1c00917
    main("https://doi.org/10.1021/acs.jcim.5b00332")
+""" + +from input.publication import Publication +from input.interface import InputInterface diff --git a/input/__pycache__/__init__.cpython-39.pyc b/input/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c120e069d340aff9e417dc18dfa02e8284e91486 Binary files /dev/null and b/input/__pycache__/__init__.cpython-39.pyc differ diff --git a/input/__pycache__/interface.cpython-39.pyc b/input/__pycache__/interface.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..845c6597fc1a11f6533754364429af32ba2ae9aa Binary files /dev/null and b/input/__pycache__/interface.cpython-39.pyc differ diff --git a/input/__pycache__/publication.cpython-39.pyc b/input/__pycache__/publication.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ecb94eef74276d09e4ce203ffd67d6033002ae8a Binary files /dev/null and b/input/__pycache__/publication.cpython-39.pyc differ diff --git a/input/get/__init__.py b/input/get/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..2a6ddd0945b8e693ff80722a6c33084a49921c7f --- /dev/null +++ b/input/get/__init__.py @@ -0,0 +1,7 @@ +#!/usr/bin/env python3 +""" +__init__ for journalFetcher-module +temp file with nothing in it right now +""" +from input.publication import Publication +from input.get.journal_fetcher import JournalFetcher diff --git a/input/get/__pycache__/__init__.cpython-39.pyc b/input/get/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e6f5d85fed9093448393bae5028b8443ca4550c9 Binary files /dev/null and b/input/get/__pycache__/__init__.cpython-39.pyc differ diff --git a/input/get/__pycache__/acs.cpython-39.pyc b/input/get/__pycache__/acs.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c0f9cbd06555179aaf5d113b3567d24f4d08bebf Binary files /dev/null and b/input/get/__pycache__/acs.cpython-39.pyc differ diff --git 
#!/usr/bin/env python3

"""
Child class of JournalFetcher.
Usage: Check if a url can be used with 'can_use_url'
       and then fetch the publication with 'get_publication'.
"""

import re

from input.get.journal_fetcher import JournalFetcher
from input.publication import Publication, Citation


class Fetcher(JournalFetcher):
    """
    Specific Fetcher for the ACS journals.
    """

    # Registrant codes (the "10.XXXX" part of a doi) of the supported journals.
    SUPPORTED_JOURNALS = ['1021']

    @staticmethod
    def can_use_url(url: str) -> bool:
        """
        Uses a regex to extract the registrant code from a doi-url.

        :param url: doi-url to check
        :return: True when the doi belongs to a supported journal,
                 False otherwise (also for non-doi urls).
        TODO: Support non Doi-urls
        """
        # Dots are escaped so that e.g. "doiXorg" is not accepted;
        # the original pattern used bare '.' which matches any character.
        matched_url = re.match(
            r'(https?://)?(doi\.org/)?(10\.(\d{4})/\w+\.\S+)', url)
        if matched_url is None:
            # The original code indexed the match unconditionally and
            # raised TypeError for any url that did not match at all.
            return False
        return matched_url[4] in Fetcher.SUPPORTED_JOURNALS

    @staticmethod
    def get_publication(url: str) -> Publication:
        """
        Fetches html and creates a BeautifulSoup-instance via the parent class,
        then runs ACS-specific css-selectors and builds a Publication-instance.

        :param url: doi-url of an ACS publication
        :return: Publication with references and citations attached
        """
        # Creation of soup
        soup = JournalFetcher.get_soup(url)
        soup_header = soup.select('.article_header')[0]
        ref_cit_soup = soup

        # Creates Publication
        doi_url = soup_header.select('a[title="DOI URL"]')[0].string
        title = soup_header.select(".hlFld-Title")[0].text

        contributors = []
        for author in soup_header.select(".hlFld-ContribAuthor"):
            contributors.append(author.text)

        journal = soup_header.select(".cit-title")[0].text

        published = soup_header.select(".pub-date-value")[0].text

        subjects = []
        subject_soup = soup_header.select('.article_header-taxonomy')[0]
        for subject in subject_soup.select('a'):
            subjects.append(subject.text)

        # TODO: the citation count is not scraped yet; Publication keeps 0.
        num_citations = 0

        references = []
        references_soup = ref_cit_soup.select('ol#references')
        if references_soup != []:
            for reference in references_soup[0].select('li'):
                # [5:] drops a 5-char prefix of the .refDoi text —
                # presumably a "DOI: " label; TODO confirm against live page.
                ref_doi = "https://doi.org/{}".format(reference.select('.refDoi')[0].text.strip()[5:])\
                    if reference.select('.refDoi') != [] else "None"
                ref_title = reference.select('.NLM_article-title')[0].text\
                    if reference.select('.NLM_article-title') != [] else "None"
                ref_journal = reference.select('i')[0].text\
                    if reference.select('i') != [] else "None"

                ref_contributors = []
                for author in reference.select('.NLM_contrib-group'):
                    ref_contributors.append(author.text)

                references.append(Citation(ref_doi, ref_title, ref_journal,
                                           ref_contributors, cit_type="Reference"))

        citations = []
        citation_soup = ref_cit_soup.select('.cited-content_cbyCitation')
        if citation_soup != []:
            for citation in citation_soup[0].select('li'):
                cit_doi = citation.select('a[title="DOI URL"]')[0].text\
                    if citation.select('a[title="DOI URL"]') != [] else "None"
                cit_title = citation.select('.cited-content_cbyCitation_article-title')[0].text\
                    if citation.select('.cited-content_cbyCitation_article-title') != [] else "None"
                cit_journal = citation.select('.cited-content_cbyCitation_journal-name')[0].text\
                    if citation.select('.cited-content_cbyCitation_journal-name') != [] else "None"
                cit_contributors = citation.select('.cited-content_cbyCitation_article-contributors')[0].text.split(', ')
                # Clean up of the last entry: strip the trailing ". ",
                # drop it entirely when nothing remains.
                cit_contributors_last = cit_contributors.pop().strip(". ")
                if cit_contributors_last != '':
                    cit_contributors.append(cit_contributors_last)
                citations.append(Citation(cit_doi, cit_title, cit_journal,
                                          cit_contributors, cit_type="Citation"))

        return Publication(doi_url, title, contributors, journal, published,
                           subjects, num_citations, references, citations)

    @staticmethod
    def test_fetcher():
        pass
+ """ + + # TODO: Check the URL for compatability + # re.match in _SUPPORTED_JOURNALS + return True + + @staticmethod + def get_publication(url: str) -> Publication: + return input(url) + + + @staticmethod + def test_fetcher(): + pass + +def get_article_info(soup): + header = soup.find('div', class_ = 'article_header-left pull-left') + article_title = header.find('span', class_ = 'hlFld-Title').text + publication_date = header.find('span', class_ = 'pub-date-value').text + for link in header.find('div', class_ = 'article_header-doiurl'): + doi_url = link.get('href') + subs = header.find('div', class_ = 'article_header-taxonomy') + subjects = [] + for sub in subs.find_all('a'): + subjects.append(sub.get('title')) + cons = header.find('ul', class_ = 'loa') + contributors = [] + for con in cons.find_all('span', class_ = 'hlFld-ContribAuthor'): + contributors.append(con.text) + numc = header.find('div', class_ = 'articleMetrics_count') + if not numc.a: + num_citations = 0 + else: + num_citations = numc.a.text + pub = Publication(doi_url, article_title, contributors, "JCIM", + publication_date, subjects, num_citations) + #pub = Publication(article_title, publication_date, contributors, doi_url, + # subjects, num_citations) + return pub + + +def get_citation_info(pub, num_citations, soup): + details = soup.find('ol', class_ = 'cited-content_cbyCitation') + titles = [] + for title in details.find_all('span', + class_ = 'cited-content_cbyCitation_article-title'): + titles.append(title.text.replace('.', '')) + journal_names = [] + for name in details.find_all('span', + class_ = 'cited-content_cbyCitation_journal-name'): + journal_names.append(name.text) + doi_urls = [] + for url in details.find_all('a'): + doi_urls.append(url.get('href')) + # TODO: There are a few diffrent types how Contributors are listed + contributors = [] + for contrib in details.find_all('span', + class_ = 'cited-content_cbyCitation_article-contributors'): + contributors.append(contrib.text) + for i in 
#!/usr/bin/env python3

"""
Parent class for specific journal fetchers.
"""

from abc import ABCMeta, abstractmethod
from bs4 import BeautifulSoup
import requests
from input.publication import Publication


class JournalFetcher(metaclass=ABCMeta):
    """
    Abstract base class for fetcher modules.
    """

    @staticmethod
    def get_soup(url: str) -> BeautifulSoup:
        """
        Retrieves website-html and returns a BeautifulSoup-instance.

        Parameters:
        -----------
        :type url: str
        :param url: doi-url to a publication
        :raises SystemExit: when the request fails or answers with an
                            http error status
        :return: BeautifulSoup-instance
        """
        try:
            req = requests.get(url)
            # 'requests.get' does not raise for 4xx/5xx answers on its own,
            # so the original 'except HTTPError' branch was dead code and
            # error pages were silently handed to the parser.
            req.raise_for_status()
        except requests.exceptions.RequestException as err:
            # Also covers ConnectionError/Timeout, which previously
            # escaped uncaught.
            raise SystemExit(err)

        return BeautifulSoup(req.content, 'html.parser')

    @staticmethod
    @abstractmethod
    def can_use_url(url: str) -> bool:
        """
        Abstract-function to be implemented in subclass.
        Checks if given url links to a supported journal.
        """
        raise AttributeError("JournalFetcher for '{}' hasnt implemented 'can_use_url()'".format(url))

    @staticmethod
    @abstractmethod
    def get_publication(url: str) -> Publication:
        """
        Abstract-function to be implemented in subclass.
        Creates a Publication-instance.
        """
        raise AttributeError("JournalFetcher for '{}' hasnt implemented 'get_publication()'".format(url))

    @staticmethod
    @abstractmethod
    def test_fetcher():
        """
        Abstract-function to be implemented in subclass.
        Unit-test for the class.
        """
        raise AttributeError("JournalFetcher: Subclass hasnt implemented 'test_fetcher()'")
#!/usr/bin/env python3

"""
Child class of JournalFetcher
Usage: Check if Url can be used with 'can_use_url'
       and then fetch publication with 'get_publication'
"""

# import re
from input.get.journal_fetcher import JournalFetcher
from input.publication import Publication


class Fetcher(JournalFetcher):

    """
    scrapes publication metadata from a provided url
    """

    # TODO: List of Compatable Journals
    # NOTE: nature does not use journal names in doi links, must match by 10.xxxx identifier instead
    SUPPORTED_JOURNALS = []

    @staticmethod
    def can_use_url(url: str) -> bool:
        """
        Checks if given url links to a supported journal.
        """
        # TODO: Check the URL for compatability
        #       (re.match against SUPPORTED_JOURNALS)
        return False

    @staticmethod
    def get_publication(url: str) -> Publication:
        """
        Builds a Publication-instance from the <meta> tags of the page header.
        """
        soup = JournalFetcher.get_soup(url)
        head = soup.head

        def meta_content(name):
            # All interesting values live in <meta name=... content=...> tags.
            return head.find(attrs={"name": name}).get("content")

        doi_url = "https://doi.org/" + meta_content("DOI")
        title = meta_content("citation_title")
        journal = meta_content("citation_journal_title")
        published = meta_content("prism.publicationDate")

        contributors = [tag.get("content")
                        for tag in head.findAll(attrs={"name": "dc.creator"})]
        subjects = [tag.get("content")
                    for tag in head.findAll(attrs={"name": "dc.subject"})]

        return Publication(doi_url, title, contributors, journal,
                           published, subjects, 0)

        # TODO: Exceptions-handling
        #   raise ValueException("Cant Fetch: '{}'".format(error))
        #   return None

    @staticmethod
    def test_fetcher():
        pass
+ """ + + # TODO: Check the URL for compatability + # url_re = re.match(r'(https?://)?(doi.org/)?(10.(\d{4})/\w+.\S+)', url) + # return url_re[4] in SUPPORTED_JOURNALS + return False + + @staticmethod + def get_publication(url: str) -> Publication: + """ + Creates a Publication-instance. + """ + + # TODO: Fetch data from the HTML + # soup = JournalFetcher.get_soup(url) + # doi,title,contributors[],journal,publication_date,subjects[],references[],citations[] + # TODO: Create new Publication-instance + # return Publication(doi,title,contributors[],journal,publication_date,subjects[],num_citation=None ,references[],citations[]) + return None + + @staticmethod + def test_fetcher(): + pass diff --git a/input/interface.py b/input/interface.py new file mode 100755 index 0000000000000000000000000000000000000000..c0d6df410ccaea3bbb01aad887b395a27d7ae160 --- /dev/null +++ b/input/interface.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python3 + +""" +Interface for the Input-Package only this should be accessed from outside this Package. + +""" +from os import walk +import importlib +import pathlib +import re +from input.publication import Publication + +class InputInterface: + """ + Singleton which dynamically imports and manages fetchers + """ + + get_path = None + fetcher_classes=[] + + @staticmethod + def get_publication(url: str) -> Publication: + """ + The interface-method to get a Publication-instance + + Parameters + ---------- + :param url: url to a Publication + :type url: str + :return: Publication instance or None if not supported + """ + # Initializes 'fetcher_classes', the list of imported modules + if InputInterface.fetcher_classes ==[]: + InputInterface.get_fetcher_classes() + if InputInterface.fetcher_classes ==[]: + raise AttributeError("No specific Fetchers where found at: '{}'" + .format(InputInterface.get_path)) + + # Checks if module supports the 'url' and returns a Publication if it does. 
#!/usr/bin/env python3


class Publication:
    """
    Represents a publication.
    """

    def __init__(self, doi_url: str, title: str
                 , contributors: list, journal: str
                 , publication_date: str, subjects: list, num_citations: int = None
                 , references: list = None, citations: list = None):
        """
        Parameters
        ----------
        :param doi_url: doi_url of the publication
        :type doi_url: str
        :param title: title of the publication
        :type title: str
        :param contributors: list of all contributors
        :type contributors: list[str]
        :param journal: journal the publication appeared in
        :type journal: str
        :param publication_date: date of release
        :type publication_date: str
        :param subjects: the subjects of the publication
        :type subjects: list[str]
        :param num_citations: citation count; defaults to len(citations)
        :type num_citations: int
        :param references: the Citations referenced by this publication
        :type references: list[Citation]
        :param citations: the Citations which reference this publication
        :type citations: list[Citation]
        :return: None
        """
        self.doi_url = doi_url
        self.title = title
        self.contributors = contributors
        self.journal = journal
        self.publication_date = publication_date
        self.subjects = subjects
        self.references = [] if references is None else references
        self.citations = [] if citations is None else citations
        # The citation count can exceed len(self.citations) when not every
        # citing work was fetched, hence the explicit parameter.
        self.num_citations = len(self.citations) if num_citations is None \
            else num_citations

    def __str__(self) -> str:
        return ("Title: {}\n"
                "Doi-url: {}\n"
                "Authors: {}\n"
                "Journal: {}\n"
                "Published on: {}\n"
                "Subjects: {}\n"
                "References: \n{}\n"
                "Citations: \n{}\n")\
            .format(self.title, self.doi_url, ", ".join(self.contributors)
                    , self.journal, self.publication_date
                    , ", ".join(self.subjects)
                    , "\n".join(self.get_citation_string(self.references))
                    , "\n".join(self.get_citation_string(self.citations)))

    @staticmethod
    def get_citation_string(citations):
        """Returns a list of printable strings, one per citation,
        or ["None"] for an empty list."""
        if citations == []:
            return ["None"]
        citation_string = []
        for citation in citations:
            citation_string.append(str(citation))
        return citation_string

    def add_citation(self, citation) -> list:
        """
        Appends a Citation or a list of Citations to self.citations.

        NOTE: this method used to be named 'citations' and was therefore
        unreachable — the instance attribute 'self.citations' assigned in
        __init__ shadowed it on every instance. Renamed as a bug fix.

        Parameter
        ---------
        :param citation: Citation or Reference of the Publication
        :type citation: Citation or list[Citation]
        :raises TypeError: when anything but (a list of) Citation is passed
        :return: self.citations
        """
        if type(citation) is Citation:
            self.citations.append(citation)
        # Checks if 'citation' is a list of Citations
        elif type(citation) is list:
            for _cit in citation:
                if type(_cit) is Citation:
                    self.citations.append(_cit)
                else:
                    raise TypeError("add_citation expects Citations or List of Citations, not: '{}'"
                                    .format(type(_cit)))
        else:
            raise TypeError("add_citation expects Citations or List of Citations, not: '{}'"
                            .format(type(citation)))

        return self.citations

    def __eq__(self, other) -> bool:
        """ Compares the unique doi_url of two Publications"""
        return self.doi_url == other.doi_url

    def print_pub(self):
        """Pretty-prints the publication and its citations to stdout."""
        print(f'''Article title: {self.title}
Publication date: {self.publication_date}
DOI-URL: {self.doi_url}

Subjects:''')
        print(*(self.subjects), sep=", ")
        print('\nContributors:')
        print(*(self.contributors), sep=", ")

        if int(self.num_citations) > 0:
            if int(self.num_citations) == 1:
                print(f'\nThis publication is cited by the following publication:\n')
            else:
                print(f'\nThis publication is cited by the following {self.num_citations} publications:\n')
            for citation in self.citations:
                print(f'''
    Title: {citation.title}
    Journal: {citation.journal}
    Contributors: {citation.contributors}
    DOI-URL: {citation.doi_url}
    ''')
        else:
            print('\nThis publication is not cited by any other publication.')


class Citation:
    """
    A work that cites (or is referenced by) a Publication.
    """
    def __init__(self, doi_url: str, title: str
                 , journal: str, contributors: list
                 , cit_type: str = "Citation"):
        """
        Parameters
        ----------
        :param doi_url: doi_url of the publication
        :type doi_url: str
        :param title: title of the publication
        :type title: str
        :param journal: journal the citing work appeared in
        :type journal: str
        :param contributors: list of all contributors
        :type contributors: list[str]
        :param cit_type: Specifies if Reference or Citation
        :type cit_type: str
        :return: None
        """
        self.title = title
        self.doi_url = doi_url
        self.journal = journal
        self.contributors = contributors
        self.cit_type = cit_type

    def __str__(self) -> str:
        return ("\t{}-Title: {}\n"
                "\t{}-Doi: {}\n"
                "\t{}-Journal: {}\n"
                "\t{}-Contributors: {}\n")\
            .format(self.cit_type, self.title
                    , self.cit_type, self.doi_url
                    , self.cit_type, self.journal
                    , self.cit_type, ", ".join(self.contributors))


# This is just a replica of Citation kept for backward compatibility.
class Reference:
    def __init__(self, doi_url: str, title: str, journal: str, contributors: list):
        self.title = title
        self.doi_url = doi_url
        self.journal = journal
        self.contributors = contributors

    def __str__(self) -> str:
        return ("\tReferences-Title: {}\n"
                "\tReferences-Doi: {}\n"
                "\tReferences-Journal: {}\n"
                "\tReferences-Contributors: {}")\
            .format(self.title, self.doi_url
                    , self.journal, ", ".join(self.contributors))
def download(url):  # Download citation and references file
    """Report whether the citation/references file named in *url* already
    exists in ./files/.

    :param url: download url whose part after the last '=' is the filename
    """
    # BUG FIX: the original guard was 'if url.find("="):' — str.find
    # returns -1 (truthy!) when '=' is absent, so the branch also ran
    # for urls without '=' and used the whole url as filename.
    if '=' in url:
        filename = url.rsplit('=', 1)[1]
        path = Path('./files/' + filename)
        if path.is_file():
            print("File already exists")
        else:
            print("File does not exist")
'cited-content_cbyCitation') + titles = [] + for title in details.find_all('span', + class_ = 'cited-content_cbyCitation_article-title'): + titles.append(title.text.replace('.', '')) + journal_names = [] + for name in details.find_all('span', + class_ = 'cited-content_cbyCitation_journal-name'): + journal_names.append(name.text) + doi_urls = [] + for url in details.find_all('a'): + doi_urls.append(url.get('href')) + contributors = [] + for contrib in details.find_all('span', + class_ = 'cited-content_cbyCitation_article-contributors'): + contributors.append(contrib.text) + for i in range(0, int(num_citations)): + pub._citations.append(Citation(titles[i], journal_names[i], + contributors[i], doi_urls[i])) +def print_pub_info(pub): + print(f'''Article title: {pub.title} +Publication date: {pub.publication_date} +DOI-URL: {pub.doi_url} + +Subjects:''') + print(*(pub.subjects), sep = ", ") + print('\nContributors:') + print(*(pub.contributors), sep = ", ") + + if int(pub.num_citations) > 0: + if int(pub.num_citations) == 1: + print(f'\nThis publication is cited by the following publication:\n') + else: + print(f'\nThis publication is cited by the following {pub.num_citations} publications:\n') + for citation in pub._citations: + print(f''' + Title: {citation.title} + Journal: {citation.journal} + Contributors: {citation.contributors} + DOI-URL: {citation.doi_url} + ''') + else: + print('\nThis publication is not cited by any other publication.') + +def input(url): + html_text = req.get(url).text + soup = bs(html_text, 'html.parser') + + pub = get_article_info(soup) + if int(pub.num_citations) > 0: + get_citation_info(pub, int(pub.num_citations), soup) + return pub + +#if len(sys.argv) != 2: +# sys.stderr.write('Usage: {} <url>\n'.format(sys.argv[0])) +# exit(1) +#url = sys.argv[1] +#pub = input(url) +#print_pub_info(pub) diff --git a/input/pub.py b/input/tempdir/pub.py similarity index 100% rename from input/pub.py rename to input/tempdir/pub.py diff --git 
a/input/tempdir/test.py b/input/tempdir/test.py new file mode 100755 index 0000000000000000000000000000000000000000..bdd12e849ed5a239cadf5f8180d319a114512f9f --- /dev/null +++ b/input/tempdir/test.py @@ -0,0 +1,15 @@ +#!/usr/bin/env python3 + +from input_fj import input, print_pub_info +import sys + +if len(sys.argv) != 3: + sys.stderr.write('Usage: {} <url>\n'.format(sys.argv[0])) + exit(1) +url = sys.argv[1] +url2 = sys.argv[2] +pub = input(url) +print_pub_info(pub) +pub2 = input(url2) +print_pub_info(pub2) + diff --git a/input/test_doi.txt b/input/test_doi.txt new file mode 100644 index 0000000000000000000000000000000000000000..ced8c84e4036ec56cc0d5f9151bb0195f28e6b75 --- /dev/null +++ b/input/test_doi.txt @@ -0,0 +1,4 @@ +https://doi.org/10.1021/acs.jcim.1c00203 +https://doi.org/10.1021/acs.jcim.1c00917 +https://doi.org/10.1021/acs.jmedchem.0c01332 +10.1093/bioinformatics/btaa190 diff --git a/input/test_input_get_publication.py b/input/test_input_get_publication.py new file mode 100755 index 0000000000000000000000000000000000000000..941dbc76b5fb190ebf906ebfea8b60fbbbdd5d40 --- /dev/null +++ b/input/test_input_get_publication.py @@ -0,0 +1,28 @@ +import unittest +""" +Testing the Publication fetcher + +Publication 1: 'https://doi.org/10.1021/acs.jcim.1c00203' +Publication 2: 'doi.org/10.1021/acs.jcim.1c00917' +Publication 3: '10.1038/nchem.1781' +Publication 4: '11.12/jaj' +Publication 5: '11.12/' +Publication 6: 'https://doi.org/10.1021/acs.jmedchem.0c01332' # Paper is a PDF +""" + + +class TestGetPublication(unittest.TestCase): + + def test_publication1(self): + pass + + def test_publication2(self): + pass + + def test_publication3(self): + pass + + +if __name__=="__main__": + print("test") + unittest.main() \ No newline at end of file diff --git a/input_old/README.md b/input_old/README.md new file mode 100644 index 0000000000000000000000000000000000000000..76bd11d5d70daac13e190f4d52269eb381413c69 --- /dev/null +++ b/input_old/README.md @@ -0,0 +1,3 @@ +# 
#!/usr/bin/env python3
"""
Functions for information retrieval of articles from the ACS journal JCIM.

Parses an article page (already fetched and soup-ified by the caller) into
Publication/Citation objects.
"""

__author__ = "Florian Jochens"
__email__ = "fj@andaco.de"
__status__ = "Production"

from bs4 import BeautifulSoup as bs
import requests as req
import sys
from pathlib import Path


class Publication:
    """Metadata of one scraped article, plus the publications citing it."""

    def __init__(self, title, publication_date, contributors, doi_url,
                 subjects=None, num_citations=None):
        self.title = title
        self.publication_date = publication_date
        self.contributors = contributors
        self.doi_url = doi_url
        self.subjects = subjects
        self.num_citations = num_citations
        # BUG FIX: these were class-level attributes, so every Publication
        # instance shared one citations/references list; make them
        # per-instance state instead.
        self._citations = []
        self._references = []


class Citation:
    """One publication that cites the scraped article."""

    def __init__(self, title, journal, contributors, doi_url):
        self.title = title
        self.journal = journal
        self.contributors = contributors
        self.doi_url = doi_url


class References:
    """One publication referenced by the scraped article."""

    def __init__(self, title, journal, contributors, doi_url):
        self.title = title
        self.journal = journal
        self.contributors = contributors
        self.doi_url = doi_url


def get_article_info(soup):
    """Parse title, date, authors, subjects and citation count from an ACS
    article page.

    :param soup: BeautifulSoup of the article HTML.
    :return: a new Publication (num_citations is the raw text of the
             metrics link, or 0 when the page shows no citations).
    """
    header = soup.find('div', class_='article_header-left pull-left')
    article_title = header.find('span', class_='hlFld-Title').text
    publication_date = header.find('span', class_='pub-date-value').text

    # ROBUSTNESS: doi_url stayed unbound (NameError below) when the page
    # had no DOI link; default it explicitly.
    doi_url = None
    for link in header.find('div', class_='article_header-doiurl'):
        doi_url = link.get('href')

    # ROBUSTNESS: some articles have no taxonomy block; the original
    # crashed with AttributeError on subs.find_all.
    subjects = []
    subs = header.find('div', class_='article_header-taxonomy')
    if subs is not None:
        subjects = [sub.get('title') for sub in subs.find_all('a')]

    cons = header.find('ul', class_='loa')
    contributors = [con.text
                    for con in cons.find_all('span',
                                             class_='hlFld-ContribAuthor')]

    numc = header.find('div', class_='articleMetrics_count')
    # No <a> inside the metrics div means the article is uncited.
    num_citations = numc.a.text if numc.a else 0

    return Publication(article_title, publication_date, contributors,
                       doi_url, subjects, num_citations)


def get_download_url(soup):
    """Return the 'Citation and references' download URL of the page.

    :param soup: BeautifulSoup of the article HTML.

    BUG FIX: the original took no parameter and read a global ``soup``
    that is never defined at module level, so every call raised
    NameError; the parsed page is now an explicit argument.
    """
    export = soup.find('div', class_='cit-download-dropdown_content')
    url = 'https://pubs.acs.org'
    for link in export.find_all('a'):
        if link.get('title') == 'Citation and references':
            url += link.get('href')
    print(url)
    return url


def download(url):  # Download citation and references file
    """Report whether the citation/reference file of ``url`` is cached.

    BUG FIX: the original tested ``url.find('=')``, which returns -1
    (truthy!) when '=' is absent and 0 (falsy) when '=' is the first
    character — inverted in both edge cases. Use a membership test.
    """
    if '=' in url:
        filename = url.rsplit('=', 1)[1]
        path = Path('./files/' + filename)
        if path.is_file():
            print("File already exists")
        else:
            print("File does not exist")


def get_citation_info(pub, num_citations, soup):
    """Attach ``num_citations`` Citation objects to ``pub._citations``.

    :param pub: Publication to fill in.
    :param num_citations: number of citations to collect (int-able).
    :param soup: BeautifulSoup of the article HTML.
    """
    pub._citations = []
    details = soup.find('ol', class_='cited-content_cbyCitation')
    titles = [title.text.replace('.', '')
              for title in details.find_all(
                  'span', class_='cited-content_cbyCitation_article-title')]
    journal_names = [name.text
                     for name in details.find_all(
                         'span',
                         class_='cited-content_cbyCitation_journal-name')]
    doi_urls = [url.get('href') for url in details.find_all('a')]
    contributors = [contrib.text
                    for contrib in details.find_all(
                        'span',
                        class_='cited-content_cbyCitation_article-contributors')]
    for i in range(int(num_citations)):
        pub._citations.append(Citation(titles[i], journal_names[i],
                                       contributors[i], doi_urls[i]))
def print_pub_info(pub):
    """Pretty-print a Publication and the publications citing it."""
    print(f'''Article title: {pub.title}
Publication date: {pub.publication_date}
DOI-URL: {pub.doi_url}

Subjects:''')
    print(*(pub.subjects), sep=", ")
    print('\nContributors:')
    print(*(pub.contributors), sep=", ")

    if int(pub.num_citations) > 0:
        if int(pub.num_citations) == 1:
            print('\nThis publication is cited by the following publication:\n')
        else:
            print(f'\nThis publication is cited by the following {pub.num_citations} publications:\n')
        for citation in pub._citations:
            print(f'''
    Title: {citation.title}
    Journal: {citation.journal}
    Contributors: {citation.contributors}
    DOI-URL: {citation.doi_url}
    ''')
    else:
        print('\nThis publication is not cited by any other publication.')


def input(url):
    """Fetch ``url``, parse it, and return the resulting Publication.

    NOTE(review): this name shadows the builtin ``input``; kept unchanged
    because external callers use it.
    """
    # ROBUSTNESS: requests.get without a timeout can block forever on a
    # stalled server; bound the wait.
    html_text = req.get(url, timeout=30).text
    soup = bs(html_text, 'html.parser')

    pub = get_article_info(soup)
    if int(pub.num_citations) > 0:
        get_citation_info(pub, int(pub.num_citations), soup)
    return pub


class Publication:
    """Plain data holder for one publication (input_old/pub.py variant)."""

    def __init__(self, title, publication_date, contributors, doi_url,
                 subjects, num_citations, num_references=None):
        self.title = title
        self.publication_date = publication_date
        self.contributors = contributors
        self.doi_url = doi_url
        self.subjects = subjects
        self.num_citations = num_citations
        # BUG FIX: ``num_references`` was assigned but never declared as a
        # parameter, so every construction raised NameError. Added as a
        # defaulted parameter to stay backward-compatible with existing
        # six-argument callers.
        self.num_references = num_references
        self._citations = []
        self._references = []


class Citation:
    """One publication that cites this one."""

    def __init__(self, title, journal, contributors, doi_url):
        self.title = title
        self.journal = journal
        self.contributors = contributors
        self.doi_url = doi_url


class References:
    """One publication referenced by this one."""

    def __init__(self, title, journal, contributors, doi_url):
        self.title = title
        self.journal = journal
        self.contributors = contributors
        self.doi_url = doi_url