Select Git revision
acs.py
-
Stahl, Merle authored
Code owners
Assign users and groups as approvers for specific file changes. Learn more.
acs.py 9.05 KiB
#!/usr/bin/env python3
"""
Child class of JournalFetcher
Usage: Check if Url can be used with 'can_use_url'
and then fetch publication with 'get_publication'
"""
import re
from input.get.journal_fetcher import JournalFetcher
from input.publication import Publication, Citation
class AcsFetcher(JournalFetcher):
    """
    Specific Fetcher for the ACS-journals.

    All methods are static: check a url with 'can_use_url', then fetch it
    with 'get_pub_light' (header data only) or 'get_publication' (header
    data plus abstract, references and citations).
    """

    # Constant for the specific doi-url of the supported Journals
    SUPPORTED_JOURNALS = ['1021']

    @staticmethod
    def can_use_url(url: str) -> bool:
        """
        Uses Regex to extract journal specific substrings in (Doi-)Urls.

        :param url: doi or url; surrounding dots and whitespace are ignored
        :return: True if the four-digit registrant code following '10.' is
                 listed in SUPPORTED_JOURNALS, otherwise False
        """
        # The dots of the host names and of the '10.' prefix are escaped so
        # that only a literal '.' is accepted there (an unescaped '.' would
        # also accept e.g. '1011021/...').
        matched_url = re.match(
            r'^(https?://)?(doi\.org/|pubs\.acs\.org/doi/)?([a-z]+/)?(10\.(\d{4})/\w+.\S+)',
            url.strip(". \t\r\n"))
        if matched_url is None:
            return False
        # Group 5 holds the four digits directly after '10.'
        return matched_url[5] in AcsFetcher.SUPPORTED_JOURNALS

    @staticmethod
    def _get_checked_soup(url: str):
        """
        Gets the soup from JournalFetcher.get_soup and checks that it is a paper.

        :param url: doi or url of the publication
        :return: the soup returned by JournalFetcher.get_soup
        :raises ValueError: if the page is one of the two known error pages
        """
        # Errors of get_soup are deliberately not caught here, they
        # propagate unchanged to the caller.
        soup = JournalFetcher.get_soup(url)

        # Raise Error if re recognizes Pattern, but url isnt correct:
        #  - for other Urls
        if soup.text.strip(" \t\n") == "Missing resource null":
            raise ValueError("'{}' matches Pattern for 'ACS', but doesnt link to Paper.".format(url))
        #  - for Dois
        if soup.title is not None and soup.title.text == "Error: DOI Not Found":
            raise ValueError("'{}' matches Pattern for 'ACS', but doesnt link to Paper.".format(url))
        return soup

    @staticmethod
    def _parse_header(soup) -> dict:
        """
        Runs the css-searches on the article header of the soup.

        :param soup: soup of an ACS article page
        :return: dict with the keys doi_url, title, contributors, journal,
                 publication_date and subjects, usable as keyword arguments
                 for Publication
        :raises IndexError: if one of the mandatory header elements is missing
        """
        # Presearch for a smaller soup
        soup_header = soup.select('.article_header')[0]

        doi_url = soup_header.select('a[title="DOI URL"]')[0].string
        title = soup_header.select(".hlFld-Title")[0].text
        contributors = [author.text
                        for author in soup_header.select(".hlFld-ContribAuthor")]

        journal = soup_header.select(".cit-title")[0].text
        # Replaces abbreviation with whole name
        if journal in JournalFetcher.abbrev_dict:
            journal = JournalFetcher.abbrev_dict[journal]

        # Format in acs: "month dd, yyyy", converted to "dd.mm.yyyy".
        # If the text has another format it is kept unchanged.
        published = soup_header.select(".pub-date-value")[0].text
        re_date = re.match(r'\s*(\w+) (\d+), (\d+)\s*', published)
        if re_date is not None:
            published = (re_date[2].zfill(2) + "."
                         + JournalFetcher.mont_to_num[re_date[1].lower()]
                         + "." + re_date[3])

        # Some Papers have no Subjects
        subject_soup = soup_header.select('.article_header-taxonomy')
        if subject_soup:
            subjects = [subject.text for subject in subject_soup[0].select('a')]
        else:
            subjects = ["None Found"]

        return {"doi_url": doi_url, "title": title,
                "contributors": contributors, "journal": journal,
                "publication_date": published, "subjects": subjects}

    @staticmethod
    def _parse_references(soup) -> list:
        """
        Creates a Citation (cit_type "Reference") for every reference with a doi.

        :param soup: soup of an ACS article page
        :return: list of Citation, empty if the html has no reference list
        """
        references = []
        references_soup = soup.select('ol#references')
        # Some Papers have no References in the html
        if not references_soup:
            return references

        for reference in references_soup[0].select('li'):
            doi_soup = reference.select('.refDoi')
            if not doi_soup:
                # Some references aren't Paper and have no Doi, we ignore those
                continue
            # The first five characters of the text are dropped
            # (presumably a "DOI: " label - TODO confirm against the html)
            ref_doi = "https://doi.org/{}".format(doi_soup[0].text.strip()[5:])

            title_soup = reference.select('.NLM_article-title')
            ref_title = title_soup[0].text if title_soup else None

            journal_soup = reference.select('i')
            ref_journal = journal_soup[0].text if journal_soup else None
            # Replaces abbreviation with whole name
            if ref_journal in JournalFetcher.abbrev_dict:
                ref_journal = JournalFetcher.abbrev_dict[ref_journal]

            ref_contributors = [
                author.text.replace("\n", " ").replace("\r", "")
                for author in reference.select('.NLM_contrib-group')]

            references.append(Citation(ref_doi, ref_title, ref_journal,
                                       ref_contributors, cit_type="Reference"))
        return references

    @staticmethod
    def _parse_citations(soup) -> list:
        """
        Creates a Citation (cit_type "Citation") for every citing entry with a doi.

        :param soup: soup of an ACS article page
        :return: list of Citation, empty if the html has no citation list
        """
        citations = []
        citation_soup = soup.select('.cited-content_cbyCitation')
        # Some Papers have no Citations in the html
        if not citation_soup:
            return citations

        for citation in citation_soup[0].select('li'):
            doi_soup = citation.select('a[title="DOI URL"]')
            if not doi_soup:
                # Some citations aren't Paper and have no Doi, we ignore those
                continue
            cit_doi = doi_soup[0].text

            title_soup = citation.select('.cited-content_cbyCitation_article-title')
            cit_title = title_soup[0].text if title_soup else None

            journal_soup = citation.select('.cited-content_cbyCitation_journal-name')
            cit_journal = journal_soup[0].text if journal_soup else None
            # Replaces abbreviation with whole name
            if cit_journal in JournalFetcher.abbrev_dict:
                cit_journal = JournalFetcher.abbrev_dict[cit_journal]

            # An entry without contributors element gets an empty list
            # instead of aborting the whole fetch with an IndexError.
            cit_contributors = []
            contributors_soup = citation.select('.cited-content_cbyCitation_article-contributors')
            if contributors_soup:
                cit_contributors = contributors_soup[0].text\
                    .replace("\n", " ").replace("\r", "").split(', ')
                # clean up of the last Entry, because sometimes there is an extra ','
                cit_contributors_last = cit_contributors.pop().strip(". ")
                if cit_contributors_last != '':
                    cit_contributors.append(cit_contributors_last)

            citations.append(Citation(cit_doi, cit_title, cit_journal,
                                      cit_contributors, cit_type="Citation"))
        return citations

    @staticmethod
    def get_pub_light(url: str) -> Publication:
        """
        Fetches html and creates soup-instance in parent class.
        Specific css-searches for ACS-Journals and creates Publication-instance
        (without References, Citations and abstract, these are set to None).

        :param url: doi or url of the publication
        :return: Publication with the header data only
        :raises ValueError: if the url does not link to a paper
        """
        soup = AcsFetcher._get_checked_soup(url)
        return Publication(references=None, citations=None, abstract=None,
                           **AcsFetcher._parse_header(soup))

    @staticmethod
    def get_publication(url: str) -> Publication:
        """
        Fetches html and creates soup-instance in parent class.
        Specific css-searches for ACS-Journals and creates Publication-instance.

        :param url: doi or url of the publication
        :return: Publication with header data, abstract, references and
                 citations; only references and citations that have a doi
                 are included
        :raises ValueError: if the url does not link to a paper
        """
        soup = AcsFetcher._get_checked_soup(url)
        header = AcsFetcher._parse_header(soup)

        # Some Papers have no abstract in the html
        abstract_soup = soup.select('.articleBody_abstractText')
        abstract = abstract_soup[0].text if abstract_soup else "Found Nothing"

        references = AcsFetcher._parse_references(soup)
        citations = AcsFetcher._parse_citations(soup)

        return Publication(references=references, citations=citations,
                           abstract=abstract, **header)