    #!/usr/bin/env python3
    """
    Functions for information retrieval of articles from the ACS journal JCIM
    
    """
    
    __author__ = "Florian Jochens"
    __email__ = "fj@andaco.de"
    __status__ = "Production"
    #__copyright__ = ""
    #__credits__ = ["", "", "", ""]
    #__license__ = ""
    #__version__ = ""
    #__maintainer__ = ""
    
    from bs4 import BeautifulSoup as bs
    import requests as req
    import sys  
    from pathlib import Path
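
    # Typical use (sketch): fetch one JCIM article page and print its details.
    #   pub = input('https://pubs.acs.org/doi/...')
    #   print_pub_info(pub)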
    
    class Publication:
        """An article together with the citing works attached to it."""
        #_registry = []

        def __init__(self, title, publication_date, contributors, doi_url,
                     subjects = None, num_citations = None):
            #self._registry.append(self)
            self.title = title
            self.publication_date = publication_date
            self.contributors = contributors
            self.doi_url = doi_url
            self.subjects = subjects
            self.num_citations = num_citations
            # per-instance lists; as class attributes they would be shared
            # (and mutated) across every Publication object
            self._citations = []
            self._references = []
    
    class Citation:
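        """One publication that cites the article, as listed on its ACS page."""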
        def __init__(self, title, journal, contributors, doi_url):
            self.title = title
            self.journal = journal
            self.contributors = contributors
            self.doi_url = doi_url
    
    class References:
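        """One work referenced by the article; same fields as Citation."""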
        def __init__(self, title, journal, contributors, doi_url):
            self.title = title
            self.journal = journal
            self.contributors = contributors
            self.doi_url = doi_url
        
    def get_article_info(soup):
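        """Parse the article header into a Publication (title, date, DOI,
        subjects, contributors and citation count)."""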
        header = soup.find('div', class_ = 'article_header-left pull-left')
        article_title = header.find('span', class_ = 'hlFld-Title').text
        publication_date = header.find('span', class_ = 'pub-date-value').text
        # the DOI link is assumed to be the first <a> in the doiurl div;
        # looping over the div's raw children can hit text nodes without .get()
        doi_url = header.find('div', class_ = 'article_header-doiurl').a.get('href')
        subs = header.find('div', class_ = 'article_header-taxonomy')
        subjects = []
        if subs is not None: # some articles carry no taxonomy block
            for sub in subs.find_all('a'):
                subjects.append(sub.get('title'))
        cons = header.find('ul', class_ = 'loa')
        contributors = []
        for con in cons.find_all('span', class_ = 'hlFld-ContribAuthor'):
            contributors.append(con.text)
        numc = header.find('div', class_ = 'articleMetrics_count')
        if numc is None or numc.a is None: # no counter link means no citations yet
            num_citations = 0
        else:
            num_citations = numc.a.text
    
        pub = Publication(article_title, publication_date, contributors, doi_url,
                          subjects, num_citations)
        return pub
    
    def get_download_url(soup): # Build the download URL for the citation export
        export = soup.find('div', class_ = 'cit-download-dropdown_content')
        url = 'https://pubs.acs.org'
        for link in export.find_all('a'):
            if link.get('title') == 'Citation and references':
                url += link.get('href')
        print(url)
        return url
    
    def download(url): # Download citation and references file
        # str.find() returns -1 (truthy) when '=' is missing, so test membership
        if '=' in url:
            filename = url.rsplit('=', 1)[1]
        else:
            filename = 'citations.ris' # fallback name (assumption; export URLs end in '=<file>')
        path = Path('./files/' + filename)
        if path.is_file():
            print("File already exists")
        else:
            path.parent.mkdir(parents = True, exist_ok = True)
            path.write_bytes(req.get(url).content)
            print("Downloaded", path)
    
    def get_citation_info(pub, num_citations, soup):
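        """Fill pub._citations with one Citation per citing publication."""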
        pub._citations = []
        details = soup.find('ol', class_ = 'cited-content_cbyCitation')
        titles = [] 
        for title in details.find_all('span', 
                class_ = 'cited-content_cbyCitation_article-title'):
            titles.append(title.text.replace('.', ''))
        journal_names = []
        for name in details.find_all('span',
                class_ = 'cited-content_cbyCitation_journal-name'):
            journal_names.append(name.text)
        doi_urls = []
        for url in details.find_all('a'): # assumes one link per cited entry
            doi_urls.append(url.get('href'))
        contributors = []
        for contrib in details.find_all('span', 
                class_ = 'cited-content_cbyCitation_article-contributors'):
            contributors.append(contrib.text)
        # clamp to the shortest list so one malformed entry cannot raise IndexError
        count = min(int(num_citations), len(titles), len(journal_names),
                    len(contributors), len(doi_urls))
        for i in range(count):
            pub._citations.append(Citation(titles[i], journal_names[i],
                                  contributors[i], doi_urls[i]))

    def print_pub_info(pub):
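        """Pretty-print a Publication and the publications citing it."""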
        print(f'''Article title:    {pub.title}
    Publication date: {pub.publication_date}
    DOI-URL:          {pub.doi_url}
    
    Subjects:''')
        print(*(pub.subjects), sep = ", ")
        print('\nContributors:')
        print(*(pub.contributors), sep = ", ")
    
        if int(pub.num_citations) > 0:
            if int(pub.num_citations) == 1:
                print('\nThis publication is cited by the following publication:\n')
            else:
                print(f'\nThis publication is cited by the following {pub.num_citations} publications:\n')
            for citation in pub._citations:
                print(f'''
        Title:        {citation.title}
        Journal:      {citation.journal}
        Contributors: {citation.contributors}
        DOI-URL:      {citation.doi_url}
                ''')
        else:
            print('\nThis publication is not cited by any other publication.')
    
    def input(url): # note: shadows Python's built-in input() in this module
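        """Fetch an article page with requests, parse it and return a Publication."""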
        html_text = req.get(url).text
        soup = bs(html_text, 'html.parser')
        
        pub = get_article_info(soup)
        if int(pub.num_citations) > 0:
            get_citation_info(pub, int(pub.num_citations), soup)
        return pub
    
    if __name__ == '__main__':
        if len(sys.argv) != 2:
            sys.stderr.write('Usage: {} <url>\n'.format(sys.argv[0]))
            sys.exit(1)
        url = sys.argv[1]
        pub = input(url)
        print_pub_info(pub)