From 30836634a39a9a9f7d6d8889a5c61e0e31ea8c57 Mon Sep 17 00:00:00 2001 From: Florian Jochens <fj@andaco.de> Date: Wed, 1 Dec 2021 12:35:05 +0100 Subject: [PATCH] added get_pub_light function only fetches information for the given article and not its citations and references --- example_input.py | 5 +++-- input/get/acs.py | 52 ++++++++++++++++++++++++++++++++++++++++++++++ input/interface.py | 28 +++++++++++++++++++++++-- 3 files changed, 81 insertions(+), 4 deletions(-) diff --git a/example_input.py b/example_input.py index 76eede0..c9bca41 100755 --- a/example_input.py +++ b/example_input.py @@ -4,9 +4,10 @@ from input.interface import InputInterface as Input def main(url: str): i = Input() - print(i.get_publication(url)) + #print(i.get_publication(url)) + print(i.get_pub_light(url)) # print(i.get_supported_fetchers()) Useless because all classes are called the same if __name__ == "__main__": #main("https://doi.org/10.1021/acs.jcim.1c0023") - main("https://doi.org/10.1021/acs.jcim.5b00332") + main("https://doi.org/10.1021/acs.jcim.5b00332") diff --git a/input/get/acs.py b/input/get/acs.py index 892cf98..9691845 100755 --- a/input/get/acs.py +++ b/input/get/acs.py @@ -35,6 +35,58 @@ class Fetcher(JournalFetcher): return False @staticmethod + + + def get_pub_light(url: str) -> Publication: + """ + Fetches html and creates BeautifulSoup-instance in parent class. + Specific css-searches for ACS-Journals and creates Publication-instance. 
+ """ + + # Creation of Soup + try: + soup = JournalFetcher.get_soup(url) + except Exception as error: + raise error + + # Raise Error if re recognizes Pattern, but url isn't correct: + # For other Urls + if soup.text.strip(" \t\n")=="Missing resource null": + raise ValueError("'{}' matches Pattern for 'ACS', but doesnt link to Paper.".format(url)) + + # For Dois + if soup.title is not None: + if soup.title.text == "Error: DOI Not Found": + raise ValueError("'{}' matches Pattern for 'ACS', but doesnt link to Paper.".format(url)) + + + soup_header = soup.select('.article_header')[0] + + # Creates Publication + doi_url = soup_header.select('a[title="DOI URL"]')[0].string + title = soup_header.select(".hlFld-Title")[0].text + + contributors = [] + for author in soup_header.select(".hlFld-ContribAuthor"): + contributors.append(author.text) + + journal = soup_header.select(".cit-title")[0].text + + # Replaces abbreviation with whole name + if journal in JournalFetcher.abbrev_dict: + journal = JournalFetcher.abbrev_dict[journal] + + + published = soup_header.select(".pub-date-value")[0].text + + subjects = [] + subject_soup = soup_header.select('.article_header-taxonomy')[0] + for subject in subject_soup.select('a'): + subjects.append(subject.text) + + return Publication(doi_url, title, contributors, journal, published, + subjects) + def get_publication(url: str) -> Publication: """ Fetches html and creates Beatifulsoup-instance in parent class. diff --git a/input/interface.py b/input/interface.py index 8f0af9d..59515b3 100755 --- a/input/interface.py +++ b/input/interface.py @@ -41,6 +41,7 @@ class InputInterface: def get_publication(self, url: str) -> Publication: """ The interface-method to get a Publication-instance + (including its citations and references) Parameters ---------- @@ -49,7 +50,8 @@ class InputInterface: :return: Publication instance or None if not supported """ - # Checks if module supports the 'url' and returns a Publication if it does. 
+ # Checks if module supports the 'url' and + # returns a Publication if it does. for fetcher_class in InputInterface.fetcher_classes: if fetcher_class.can_use_url(url): return fetcher_class.get_publication(url) @@ -57,8 +59,30 @@ class InputInterface: # No Module for given url was found raise ValueError("'{}' is not supported".format(url)) + def get_pub_light(self, url: str) -> Publication: + """ + The interface-method to get a Publication-instance + (only for main article) + + Parameters + ---------- + :param url: url to a Publication + :type url: str + :return: Publication instance (raises ValueError if the url is not supported) + """ + + # Checks if module supports the 'url' and + # returns a Publication if it does. + for fetcher_class in InputInterface.fetcher_classes: + if fetcher_class.can_use_url(url): + return fetcher_class.get_pub_light(url) + + # No Module for given url was found + raise ValueError("'{}' is not supported".format(url)) + def get_supported_fetchers(self): - # print(self.fetcher_classes[0].__name__) Useless right now, because all classes are called the same + # print(self.fetcher_classes[0].__name__) Useless right now, + # because all classes are called the same return [a.__name__ for a in self.fetcher_classes] def import_fetcher_classes(self): -- GitLab