Skip to content
Snippets Groups Projects
Select Git revision
  • bf170e19e1d191d01e396e6faebc531697bea3cc
  • main default protected
2 results

acs.py

  • Code owners
    Assign users and groups as approvers for specific file changes. Learn more.
    acs.py 9.05 KiB
    #!/usr/bin/env python3
    
    """
    Child class of JournalFetcher
    Usage: Check if Url can be used with 'can_use_url'
           and then fetch publication with 'get_publication'
    """
    
    import re
    
    from input.get.journal_fetcher import JournalFetcher
    from input.publication import Publication, Citation
    
    
    class AcsFetcher(JournalFetcher):
        """
        Specific Fetcher for the ACS-journals.

        Public entry points (all static):
          - can_use_url:     checks if a (Doi-)Url belongs to a supported journal
          - get_pub_light:   fetches only the header information of a publication
          - get_publication: fetches header, abstract, references and citations
        """

        # Constant for the specific doi-url of the supported Journals
        SUPPORTED_JOURNALS = ['1021']

        # Pattern for (Doi-)Urls; group 5 is the four-digit code after '10.'.
        # The dots in 'doi.org', 'pubs.acs.org' and '10.' are escaped so that they only
        # match a literal dot. The dot in '\w+.\S+' intentionally stays a wildcard, so
        # that suffixes without a dot (e.g. 'ja00051a040') keep matching.
        _URL_PATTERN = re.compile(
            r'^(https?://)?(doi\.org/|pubs\.acs\.org/doi/)?([a-z]+/)?(10\.(\d{4})/\w+.\S+)')

        # Error text for Urls that match the pattern but do not lead to a paper
        _NO_PAPER_MSG = "'{}' matches Pattern for 'ACS', but doesnt link to Paper."

        @staticmethod
        def can_use_url(url: str) -> bool:
            """
            Uses Regex to extract journal specific substrings in (Doi-)Urls.

            Parameters:
                url: Doi or Url; leading/trailing dots and whitespace are ignored.

            Returns:
                True if the Url matches the pattern and its four-digit code is listed
                in SUPPORTED_JOURNALS, otherwise False.
            """
            matched_url = AcsFetcher._URL_PATTERN.match(url.strip(". \t\r\n"))
            return matched_url is not None and matched_url[5] in AcsFetcher.SUPPORTED_JOURNALS

        @staticmethod
        def _fetch_soup(url: str):
            """
            Fetches the html via the parent class and rejects pages that are no paper.

            Raises:
                ValueError: if the page is one of the two known 'not found' pages.
                Exceptions raised by JournalFetcher.get_soup are passed on unchanged.
            """
            soup = JournalFetcher.get_soup(url)

            # Raise Error if re recognizes Pattern, but url isnt correct:
            # - for other Urls
            if soup.text.strip(" \t\n") == "Missing resource null":
                raise ValueError(AcsFetcher._NO_PAPER_MSG.format(url))

            # - for Dois
            if soup.title is not None and soup.title.text == "Error: DOI Not Found":
                raise ValueError(AcsFetcher._NO_PAPER_MSG.format(url))

            return soup

        @staticmethod
        def _first_text(soup, selector: str):
            """
            Returns the text of the first element matching the css-selector,
            or None if nothing matches.
            """
            found = soup.select(selector)
            return found[0].text if found else None

        @staticmethod
        def _parse_header(soup) -> dict:
            """
            Specific css-searches in the article header of an ACS page.

            Returns:
                dict with the keys doi_url, title, contributors, journal,
                publication_date and subjects (keyword arguments of Publication).

            Raises:
                IndexError: if the header or one of its mandatory fields is missing.
            """
            # Presearch for a smaller soup
            soup_header = soup.select('.article_header')[0]

            doi_url = soup_header.select('a[title="DOI URL"]')[0].string
            title = soup_header.select(".hlFld-Title")[0].text
            contributors = [author.text for author in soup_header.select(".hlFld-ContribAuthor")]

            journal = soup_header.select(".cit-title")[0].text
            # Replaces abbreviation with whole name
            if journal in JournalFetcher.abbrev_dict:
                journal = JournalFetcher.abbrev_dict[journal]

            # Format in acs :"month dd, yyyy"
            published = soup_header.select(".pub-date-value")[0].text
            re_date = re.match(r'\s*(\w+) (\d+), (\d+)\s*', published)
            # Converted to dd.mm.yyyy; an unexpected format is kept as found
            if re_date is not None:
                published = (re_date[2].zfill(2) + "."
                             + JournalFetcher.mont_to_num[re_date[1].lower()]
                             + "." + re_date[3])

            # Some Papers have no Subjects
            subject_soup = soup_header.select('.article_header-taxonomy')
            if subject_soup:
                subjects = [subject.text for subject in subject_soup[0].select('a')]
            else:
                subjects = ["None Found"]

            return {"doi_url": doi_url, "title": title, "contributors": contributors,
                    "journal": journal, "publication_date": published, "subjects": subjects}

        @staticmethod
        def _parse_references(soup) -> list:
            """
            Collects the references of the paper as Citation-instances (cit_type "Reference").
            References without a Doi are skipped.
            """
            references = []
            references_soup = soup.select('ol#references')
            # Some Papers have no References in the html
            if not references_soup:
                return references

            for reference in references_soup[0].select('li'):
                doi_soup = reference.select('.refDoi')
                # Some references aren't Paper and have no Doi, we ignore those
                if not doi_soup:
                    continue
                # The first five characters of the stripped text are cut off, the rest is the Doi
                ref_doi = "https://doi.org/{}".format(doi_soup[0].text.strip()[5:])

                ref_title = AcsFetcher._first_text(reference, '.NLM_article-title')
                ref_journal = AcsFetcher._first_text(reference, 'i')

                # Replaces abbreviation with whole name
                if ref_journal in JournalFetcher.abbrev_dict:
                    ref_journal = JournalFetcher.abbrev_dict[ref_journal]

                ref_contributors = [author.text.replace("\n", " ").replace("\r", "")
                                    for author in reference.select('.NLM_contrib-group')]

                references.append(Citation(ref_doi, ref_title, ref_journal, ref_contributors, cit_type="Reference"))

            return references

        @staticmethod
        def _parse_citations(soup) -> list:
            """
            Collects the papers citing this paper as Citation-instances (cit_type "Citation").
            Citations without a Doi are skipped.
            """
            citations = []
            citation_soup = soup.select('.cited-content_cbyCitation')
            # Some Papers have no Citations in the html
            if not citation_soup:
                return citations

            for citation in citation_soup[0].select('li'):
                doi_soup = citation.select('a[title="DOI URL"]')
                # Some citations aren't Paper and have no Doi, we ignore those
                if not doi_soup:
                    continue
                cit_doi = doi_soup[0].text

                cit_title = AcsFetcher._first_text(citation, '.cited-content_cbyCitation_article-title')
                cit_journal = AcsFetcher._first_text(citation, '.cited-content_cbyCitation_journal-name')

                # Replaces abbreviation with whole name
                if cit_journal in JournalFetcher.abbrev_dict:
                    cit_journal = JournalFetcher.abbrev_dict[cit_journal]

                # An entry without contributors gets an empty list instead of
                # aborting the whole fetch with an IndexError
                cit_contributors = []
                contributors_text = AcsFetcher._first_text(
                    citation, '.cited-content_cbyCitation_article-contributors')
                if contributors_text is not None:
                    cit_contributors = contributors_text.replace("\n", " ").replace("\r", "").split(', ')
                    # clean up of the last Entry, because sometimes there is an extra ','
                    cit_contributors_last = cit_contributors.pop().strip(". ")
                    if cit_contributors_last != '':
                        cit_contributors.append(cit_contributors_last)

                citations.append(Citation(cit_doi, cit_title, cit_journal, cit_contributors, cit_type = "Citation"))

            return citations

        @staticmethod
        def get_pub_light(url: str) -> Publication:
            """
            Fetches html and creates BeautifulSoup-instance in parent class.
            Specific css-searches for ACS-Journals and creates Publication-instance (without References, Citations and abstract).

            Raises:
                ValueError: if the Url matches the pattern but doesn't link to a paper.
            """
            soup = AcsFetcher._fetch_soup(url)
            header = AcsFetcher._parse_header(soup)

            return Publication(references = None, citations = None, abstract = None, **header)

        @staticmethod
        def get_publication(url: str) -> Publication:
            """
            Fetches html and creates BeautifulSoup-instance in parent class.
            Specific css-searches for ACS-Journals and creates Publication-instance.

            Raises:
                ValueError: if the Url matches the pattern but doesn't link to a paper.
            """
            soup = AcsFetcher._fetch_soup(url)
            header = AcsFetcher._parse_header(soup)

            # Some Papers have no abstract in the html
            abstract_soup = soup.select('.articleBody_abstractText')
            abstract = abstract_soup[0].text if abstract_soup else "Found Nothing"

            references = AcsFetcher._parse_references(soup)
            citations = AcsFetcher._parse_citations(soup)

            return Publication(references = references, citations = citations, abstract = abstract, **header)