acs.py

#!/usr/bin/env python3

"""
Child class of JournalFetcher
Usage: Check if Url can be used with 'can_use_url'
       and then fetch publication with 'get_publication'
"""

import re

from input.get.journal_fetcher import JournalFetcher
from input.publication import Publication, Citation


class AcsFetcher(JournalFetcher):
    """
    Specific Fetcher for the ACS-journals.

    All public methods are static:
    - 'can_use_url' checks whether a (DOI-)URL belongs to a supported journal,
    - 'get_pub_light' fetches only the header data of a publication,
    - 'get_publication' additionally fetches abstract, references and citations.
    """

    # DOI registrant codes (the four digits following '10.') of the supported journals
    SUPPORTED_JOURNALS = ['1021']

    # Error text used when the URL pattern matches but the page is not a paper
    _NO_PAPER_MESSAGE = "'{}' matches Pattern for 'ACS', but doesnt link to Paper."

    # Optional scheme, optional host prefix, optional lowercase path segment,
    # then the DOI itself (group 4); group 5 captures the registrant code.
    # The host dots and the dot in '10.' are escaped so they only match a
    # literal '.'; the dot inside the DOI suffix deliberately stays a wildcard.
    _URL_PATTERN = re.compile(
        r'^(https?://)?(doi\.org/|pubs\.acs\.org/doi/)?([a-z]+/)?(10\.(\d{4})/\w+.\S+)')

    @staticmethod
    def can_use_url(url: str) -> bool:
        """
        Uses Regex to extract journal specific substrings in (Doi-)Urls.

        Surrounding dots and whitespace are stripped from 'url' before matching.

        Returns:
            True if 'url' matches the DOI/URL pattern and its registrant code
            is listed in SUPPORTED_JOURNALS, otherwise False.
        """
        matched_url = AcsFetcher._URL_PATTERN.match(url.strip(". \t\r\n"))
        return (matched_url is not None
                and matched_url[5] in AcsFetcher.SUPPORTED_JOURNALS)

    @staticmethod
    def _fetch_soup(url: str):
        """
        Fetches the parsed html via the parent class and rejects error pages.

        Exceptions raised by 'JournalFetcher.get_soup' propagate unchanged.

        Raises:
            ValueError: if the page is one of the two known "not found" pages.
        """
        soup = JournalFetcher.get_soup(url)

        # Error page delivered for non-DOI urls that match the pattern
        if soup.text.strip(" \t\n") == "Missing resource null":
            raise ValueError(AcsFetcher._NO_PAPER_MESSAGE.format(url))

        # Error page delivered for DOIs that match the pattern
        if soup.title is not None and soup.title.text == "Error: DOI Not Found":
            raise ValueError(AcsFetcher._NO_PAPER_MESSAGE.format(url))

        return soup

    @staticmethod
    def _parse_header(soup) -> dict:
        """
        Extracts the header data of a paper from the '.article_header' element.

        Returns:
            dict with the keys 'doi_url', 'title', 'contributors', 'journal',
            'publication_date' and 'subjects', named like the matching
            keyword arguments of 'Publication'.

        Raises:
            IndexError: if one of the mandatory header elements is missing.
        """
        # Presearch for a smaller soup
        soup_header = soup.select('.article_header')[0]

        doi_url = soup_header.select('a[title="DOI URL"]')[0].string
        title = soup_header.select(".hlFld-Title")[0].text
        contributors = [author.text
                        for author in soup_header.select(".hlFld-ContribAuthor")]

        journal = soup_header.select(".cit-title")[0].text
        # Replaces abbreviation with whole name
        if journal in JournalFetcher.abbrev_dict:
            journal = JournalFetcher.abbrev_dict[journal]

        # Format in acs: "month dd, yyyy"; converted to "dd.mm.yyyy".
        # If the text has another format it is passed on unchanged.
        published = soup_header.select(".pub-date-value")[0].text
        re_date = re.match(r'\s*(\w+) (\d+), (\d+)\s*', published)
        if re_date is not None:
            published = (re_date[2].zfill(2) + "."
                         + JournalFetcher.mont_to_num[re_date[1].lower()]
                         + "." + re_date[3])

        # Some Papers have no Subjects
        subject_soup = soup_header.select('.article_header-taxonomy')
        if subject_soup:
            subjects = [subject.text for subject in subject_soup[0].select('a')]
        else:
            subjects = ["None Found"]

        return {
            "doi_url": doi_url,
            "title": title,
            "contributors": contributors,
            "journal": journal,
            "publication_date": published,
            "subjects": subjects,
        }

    @staticmethod
    def _parse_references(soup) -> list:
        """
        Collects the references of a paper as 'Citation'-instances.

        Returns an empty list if the html contains no 'ol#references' element.
        References without a '.refDoi' element are skipped.
        """
        references = []
        references_soup = soup.select('ol#references')
        # Some Papers have no References in the html
        if not references_soup:
            return references

        for reference in references_soup[0].select('li'):
            doi_soup = reference.select('.refDoi')
            # Some references aren't Paper and have no Doi, we ignore those
            if not doi_soup:
                continue
            # Drops the first five characters of the stripped text
            # (presumably a "DOI: " label -- confirm against the page markup)
            ref_doi = "https://doi.org/{}".format(doi_soup[0].text.strip()[5:])

            title_soup = reference.select('.NLM_article-title')
            ref_title = title_soup[0].text if title_soup else None

            journal_soup = reference.select('i')
            ref_journal = journal_soup[0].text if journal_soup else None
            # Replaces abbreviation with whole name
            if ref_journal in JournalFetcher.abbrev_dict:
                ref_journal = JournalFetcher.abbrev_dict[ref_journal]

            ref_contributors = [author.text.replace("\n", " ").replace("\r", "")
                                for author in reference.select('.NLM_contrib-group')]

            references.append(Citation(ref_doi, ref_title, ref_journal,
                                       ref_contributors, cit_type="Reference"))

        return references

    @staticmethod
    def _parse_citations(soup) -> list:
        """
        Collects the citing papers as 'Citation'-instances.

        Returns an empty list if the html contains no
        '.cited-content_cbyCitation' element. Entries without a DOI link are
        skipped; entries without a contributors element get an empty
        contributor list.
        """
        citations = []
        citation_soup = soup.select('.cited-content_cbyCitation')
        # Some Papers have no Citations in the html
        if not citation_soup:
            return citations

        for citation in citation_soup[0].select('li'):
            doi_soup = citation.select('a[title="DOI URL"]')
            # Some citations aren't Paper and have no Doi, we ignore those
            if not doi_soup:
                continue
            cit_doi = doi_soup[0].text

            title_soup = citation.select('.cited-content_cbyCitation_article-title')
            cit_title = title_soup[0].text if title_soup else None

            journal_soup = citation.select('.cited-content_cbyCitation_journal-name')
            cit_journal = journal_soup[0].text if journal_soup else None
            # Replaces abbreviation with whole name
            if cit_journal in JournalFetcher.abbrev_dict:
                cit_journal = JournalFetcher.abbrev_dict[cit_journal]

            cit_contributors = []
            contributors_soup = citation.select(
                '.cited-content_cbyCitation_article-contributors')
            # A missing contributors element must not abort the whole fetch
            if contributors_soup:
                cit_contributors = contributors_soup[0].text\
                    .replace("\n", " ").replace("\r", "").split(', ')
                # clean up of the last Entry, because sometimes there is an extra ','
                cit_contributors_last = cit_contributors.pop().strip(". ")
                if cit_contributors_last != '':
                    cit_contributors.append(cit_contributors_last)

            citations.append(Citation(cit_doi, cit_title, cit_journal,
                                      cit_contributors, cit_type="Citation"))

        return citations

    @staticmethod
    def get_pub_light(url: str) -> Publication:
        """
        Fetches html and creates Beautifulsoup-instance in parent class.
        Specific css-searches for ACS-Journals and creates Publication-instance
        (without References, Citations and abstract, which are set to None).

        Raises:
            ValueError: if 'url' leads to a known "not found" page.
            IndexError: if a mandatory header element is missing in the html.
        """
        soup = AcsFetcher._fetch_soup(url)
        header = AcsFetcher._parse_header(soup)

        return Publication(references=None, citations=None, abstract=None,
                           **header)

    @staticmethod
    def get_publication(url: str) -> Publication:
        """
        Fetches html and creates Beautifulsoup-instance in parent class.
        Specific css-searches for ACS-Journals and creates Publication-instance
        including abstract, references and citations.

        The abstract is "Found Nothing" if the html contains none; references
        and citations are empty lists if the html contains none.

        Raises:
            ValueError: if 'url' leads to a known "not found" page.
            IndexError: if a mandatory header element is missing in the html.
        """
        soup = AcsFetcher._fetch_soup(url)
        header = AcsFetcher._parse_header(soup)

        # Some Papers have no abstract in the html
        abstract_soup = soup.select('.articleBody_abstractText')
        abstract = abstract_soup[0].text if abstract_soup else "Found Nothing"

        references = AcsFetcher._parse_references(soup)
        citations = AcsFetcher._parse_citations(soup)

        return Publication(references=references, citations=citations,
                           abstract=abstract, **header)