Skip to content
Snippets Groups Projects
Commit a07147ed authored by Florian Jochens's avatar Florian Jochens
Browse files

added updated input files

parent 266e121b
No related branches found
No related tags found
No related merge requests found
Showing
with 694 additions and 1 deletion
#!/usr/bin/env python3
from input.interface import InputInterface
import input.publication
def main(url: str):
    """
    Fetch the publication behind the given url and print it to stdout.

    :param url: url of the publication that should be fetched
    :type url: str
    :return: None
    """
    publication = InputInterface.get_publication(url)
    print(publication)


if __name__ == "__main__":
    # Further DOIs that were used while trying out the fetchers:
    #   https://doi.org/10.1021/acs.jcim.1c00203
    #   https://doi.org/10.1021/acs.jcim.1c00917
    main("https://doi.org/10.1021/acs.jcim.5b00332")
# Projekt CiS-Projekt 2021/22
Input-Skripts
Input package that fetches publication information for a given URL.
## Usage/Examples
```python
from input.interface import get_publication
from input.publication import Publication
def main(url):
try:
pub = get_publication(url)
except Exception as error:
raise error
print(pub)
pub.title = "Cool new Title"
print(pub)
if __name__=="__main__":
main("https://doi.org/10.1021/acs.chemrev.8b00728")
```
## Authors
- Florian Jochens
- Sam Ockenden
- Julius Schenk
\ No newline at end of file
"""
init.py for Input-Package.
"""
from input.publication import Publication
from input.interface import InputInterface
File added
File added
File added
#!/usr/bin/env python3
"""
__init__ for journalFetcher-module
temp file with nothing in it right now
"""
from input.publication import Publication
from input.get.journal_fetcher import JournalFetcher
File added
File added
File added
File added
File added
#!/usr/bin/env python3
"""
Child class of JournalFetcher
Usage: Check if Url can be used with 'can_use_url'
and then fetch publication with 'get_publication'
"""
import re
from input.get.journal_fetcher import JournalFetcher
from input.publication import Publication, Citation
class Fetcher(JournalFetcher):
    """
    Specific Fetcher for the ACS journals.
    """

    # Constant for the abbreviations of the supported Journals
    SUPPORTED_JOURNALS = ['1021']

    @staticmethod
    def can_use_url(url: str) -> bool:
        """
        Uses Regex to extract journal specific substrings in Doi.
        TODO: Support non Doi-urls

        :param url: url (or bare doi) of a publication
        :type url: str
        :return: True if the four digits after '10.' are listed in
                 SUPPORTED_JOURNALS, False otherwise (also when the url
                 does not look like a doi at all)
        """
        # The '.' between '\w+' and '\S+' is left unescaped on purpose so that
        # doi-suffixes without a literal dot are still accepted.
        matched_url = re.match(r'(https?://)?(doi\.org/)?(10\.(\d{4})/\w+.\S+)', url)
        if matched_url is None:
            # Not a doi-like url: this fetcher is simply not responsible for it.
            return False
        return matched_url[4] in Fetcher.SUPPORTED_JOURNALS

    @staticmethod
    def get_publication(url: str) -> Publication:
        """
        Fetches html and creates a BeautifulSoup-instance in the parent class.
        Runs the ACS specific css-searches and creates a Publication-instance.

        :param url: url of the publication
        :type url: str
        :return: Publication filled with header data, references and citations
        :raises IndexError: if a mandatory header element (header block, doi,
                            title, journal, publication date) is not on the page
        """
        # Creation of Soup
        soup = JournalFetcher.get_soup(url)
        soup_header = soup.select('.article_header')[0]

        # Mandatory header fields
        doi_url = soup_header.select('a[title="DOI URL"]')[0].string
        title = soup_header.select(".hlFld-Title")[0].text
        contributors = [author.text for author in soup_header.select(".hlFld-ContribAuthor")]
        journal = soup_header.select(".cit-title")[0].text
        published = soup_header.select(".pub-date-value")[0].text

        # Subjects are optional: a missing taxonomy block yields an empty list
        subjects = []
        subject_soup = soup_header.select('.article_header-taxonomy')
        if subject_soup:
            subjects = [subject.text for subject in subject_soup[0].select('a')]

        # The citation counter is not scraped here and is always passed as 0
        num_citations = 0
        references = Fetcher._get_references(soup)
        citations = Fetcher._get_citations(soup)

        return Publication(doi_url, title, contributors, journal, published,
                           subjects, num_citations, references, citations)

    @staticmethod
    def _first_text(node, selector: str, default: str = "None") -> str:
        """Returns the text of the first match of 'selector' below 'node', or 'default'."""
        matches = node.select(selector)
        return matches[0].text if matches else default

    @staticmethod
    def _get_references(soup) -> list:
        """Builds one Citation (cit_type="Reference") per 'li' in 'ol#references'."""
        references = []
        references_soup = soup.select('ol#references')
        if not references_soup:
            return references

        for reference in references_soup[0].select('li'):
            ref_doi_soup = reference.select('.refDoi')
            if ref_doi_soup:
                # The first five characters of the stripped text are dropped
                # (presumably a "DOI: " prefix - confirm against the page markup).
                ref_doi = "https://doi.org/{}".format(ref_doi_soup[0].text.strip()[5:])
            else:
                ref_doi = "None"
            ref_title = Fetcher._first_text(reference, '.NLM_article-title')
            ref_journal = Fetcher._first_text(reference, 'i')
            ref_contributors = [author.text for author in reference.select('.NLM_contrib-group')]

            references.append(Citation(ref_doi, ref_title, ref_journal,
                                       ref_contributors, cit_type="Reference"))
        return references

    @staticmethod
    def _get_citations(soup) -> list:
        """Builds one Citation (cit_type="Citation") per 'li' in the cited-by list."""
        citations = []
        citation_soup = soup.select('.cited-content_cbyCitation')
        if not citation_soup:
            return citations

        for citation in citation_soup[0].select('li'):
            cit_doi = Fetcher._first_text(citation, 'a[title="DOI URL"]')
            cit_title = Fetcher._first_text(citation, '.cited-content_cbyCitation_article-title')
            cit_journal = Fetcher._first_text(citation, '.cited-content_cbyCitation_journal-name')

            # Contributors are optional as well; without the element the list stays empty
            contributor_soup = citation.select('.cited-content_cbyCitation_article-contributors')
            cit_contributors = contributor_soup[0].text.split(', ') if contributor_soup else []
            if cit_contributors:
                # clean up of the last Entry
                cit_contributors_last = cit_contributors.pop().strip(". ")
                if cit_contributors_last != '':
                    cit_contributors.append(cit_contributors_last)

            citations.append(Citation(cit_doi, cit_title, cit_journal,
                                      cit_contributors, cit_type="Citation"))
        return citations

    @staticmethod
    def test_fetcher():
        """Placeholder for the unit-test of this fetcher; does nothing yet."""
        pass
#!/usr/bin/env python3
"""
Child class of JournalFetcher
JCIM
"""
# import re
from input.get.journal_fetcher import JournalFetcher
from input.publication import Publication, Citation, Reference
import requests as req
from bs4 import BeautifulSoup as bs
class Fetcher(JournalFetcher):
    """
    Fetcher whose scraping is delegated to the module-level 'input' function.
    """

    # TODO: Naming-Convention:
    #   Class: 'Fetcher'
    #   file: input_get_[journal-/organisation-name]
    #       format = "input_get_[a-z]*.py" allowed
    # TODO: List of Compatable Journals
    _SUPPORTED_JOURNALS = []

    @staticmethod
    def can_use_url(url: str) -> bool:
        """
        Checks if given url links to a supported journal.

        No check is implemented yet: every url is reported as usable.
        """
        # TODO: Check the URL for compatability
        #   re.match in _SUPPORTED_JOURNALS
        return True

    @staticmethod
    def get_publication(url: str) -> Publication:
        """
        Returns whatever the module-level 'input' function builds for 'url'.
        """
        publication = input(url)
        return publication

    @staticmethod
    def test_fetcher():
        """Placeholder for the unit-test of this fetcher; does nothing yet."""
        pass
def get_article_info(soup):
    """
    Reads the article header of a parsed page and builds a Publication.

    :param soup: parsed html of the publication page
    :return: Publication with doi-url, title, contributors, the fixed journal
             name "JCIM", publication date, subjects and number of citations
    :raises ValueError: if the header contains no doi link
    """
    header = soup.find('div', class_ = 'article_header-left pull-left')
    article_title = header.find('span', class_ = 'hlFld-Title').text
    publication_date = header.find('span', class_ = 'pub-date-value').text

    # Only real <a> tags are inspected: iterating the raw children of the div
    # would also yield text nodes, which have no 'get'. As before, the last
    # link found wins.
    doi_url = None
    doi_container = header.find('div', class_ = 'article_header-doiurl')
    if doi_container is not None:
        for link in doi_container.find_all('a'):
            doi_url = link.get('href')
    if doi_url is None:
        raise ValueError("No DOI link found in the article header")

    subs = header.find('div', class_ = 'article_header-taxonomy')
    subjects = [sub.get('title') for sub in subs.find_all('a')]

    cons = header.find('ul', class_ = 'loa')
    contributors = [con.text
                    for con in cons.find_all('span', class_ = 'hlFld-ContribAuthor')]

    # A missing metrics block or a metrics block without a link both mean
    # "no citations". The count is stored as int so that it matches the
    # type Publication declares for it.
    numc = header.find('div', class_ = 'articleMetrics_count')
    if numc is None or not numc.a:
        num_citations = 0
    else:
        num_citations = int(numc.a.text)

    return Publication(doi_url, article_title, contributors, "JCIM",
                       publication_date, subjects, num_citations)
def get_citation_info(pub, num_citations, soup):
    """
    Reads the "cited by" list of a parsed page and appends one Citation per
    entry to 'pub.citations'.

    At most 'num_citations' entries are added. If the page lists fewer
    complete entries than that, only the available ones are added.

    :param pub: Publication whose 'citations' list is extended in place
    :param num_citations: upper bound for the number of citations to add
    :param soup: parsed html of the publication page
    :return: None
    """
    details = soup.find('ol', class_ = 'cited-content_cbyCitation')
    if details is None:
        # Page has no "cited by" list, nothing to add.
        return

    titles = [title.text.replace('.', '')
              for title in details.find_all(
                  'span', class_ = 'cited-content_cbyCitation_article-title')]

    journal_names = [name.text
                     for name in details.find_all(
                         'span', class_ = 'cited-content_cbyCitation_journal-name')]

    doi_urls = [url.get('href') for url in details.find_all('a')]

    # TODO: There are a few diffrent types how Contributors are listed
    contributors = []
    for contrib in details.find_all(
            'span', class_ = 'cited-content_cbyCitation_article-contributors'):
        # Citation expects a list of names, so the comma separated text is
        # split and the trailing ". " of the last name is removed.
        names = contrib.text.split(', ')
        last_name = names.pop().strip(". ")
        if last_name != '':
            names.append(last_name)
        contributors.append(names)

    # zip() stops at the shortest list, which avoids an IndexError when the
    # page shows fewer entries than the citation counter claims.
    entries = list(zip(doi_urls, titles, journal_names, contributors))
    for doi_url, title, journal_name, names in entries[:int(num_citations)]:
        pub.citations.append(Citation(doi_url, title, journal_name, names))
def input(url):
    """
    Downloads the page behind 'url' and turns it into a Publication.

    The article header is always parsed; the "cited by" list is only parsed
    when the header reports at least one citation.

    :param url: url of the publication page
    :return: the Publication built from the page
    """
    response = req.get(url)
    soup = bs(response.text, 'html.parser')

    pub = get_article_info(soup)
    citation_count = int(pub.num_citations)
    if citation_count > 0:
        get_citation_info(pub, citation_count, soup)
    return pub
#!/usr/bin/env python3
"""
Parent class for specific Journal
"""
from abc import ABCMeta, abstractmethod
from bs4 import BeautifulSoup
import requests
from input.publication import Publication
class JournalFetcher(metaclass=ABCMeta):
    """
    This is an abstract class for fetcher modules.
    """

    @staticmethod
    def get_soup(url: str) -> BeautifulSoup:
        """
        Retrieves website-html and returns a BeautifulSoup-instance

        Parameters:
        -----------
        :type url: str
        :param url: doi-url to a publication
        :return: BeautifulSoup-instance
        :raises SystemExit: if the server answers with an HTTP error status
        """
        try:
            req = requests.get(url)
            # requests.get() does not raise for 4xx/5xx answers on its own;
            # without this call the HTTPError handler below is never reached.
            req.raise_for_status()
        except requests.exceptions.HTTPError as err:
            raise SystemExit(err)

        return BeautifulSoup(req.content, 'html.parser')

    @staticmethod
    @abstractmethod
    def can_use_url(url: str) -> bool:
        """
        Abstract-function to be implemented in subclass.
        Checks if given url links to a supported journal
        """
        raise AttributeError("JournalFetcher for '{}' hasnt implemented 'can_use_url()'".format(url))

    @staticmethod
    @abstractmethod
    def get_publication(url: str) -> Publication:
        """
        Abstract-function to be implemented in subclass.
        Creates a Publication-instance.
        """
        raise AttributeError("JournalFetcher for '{}' hasnt implemented 'get_publication()'".format(url))

    @staticmethod
    @abstractmethod
    def test_fetcher():
        """
        Abstract-function to be implemented in subclass.
        Unit-test for the class.
        """
        raise AttributeError("JournalFetcher: Subclass hasnt implemented 'test_fetcher()'")
#!/usr/bin/env python3
"""
Child class of JournalFetcher
Usage: Check if Url can be used with 'can_use_url'
and then fetch publication with 'get_publication'
"""
# import re
from input.get.journal_fetcher import JournalFetcher
from input.publication import Publication
class Fetcher(JournalFetcher):
    """
    scrapes publication metadata from a provided url
    """

    # TODO: List of Compatable Journals
    # NOTE: nature does not use journal names in doi links, must match by 10.xxxx identifier instead
    SUPPORTED_JOURNALS = []

    @staticmethod
    def can_use_url(url: str) -> bool:
        """
        Checks if given url links to a supported journal.

        No check is implemented yet: every url is reported as not usable.
        """
        # TODO: Check the URL for compatability
        #   re.match in SUPPORTED_JOURNALS
        return False

    @staticmethod
    def get_publication(url: str) -> Publication:
        """
        Creates a Publication-instance.

        All values are read from the 'content' attribute of <meta> tags in the
        html head of the page.

        :param url: url of the publication
        :type url: str
        :return: Publication with doi-url, title, contributors, journal,
                 publication date and subjects; the number of citations is
                 always passed as 0
        :raises AttributeError: if one of the single-valued meta tags (DOI,
                                title, journal, publication date) is missing
        """
        soup = JournalFetcher.get_soup(url)
        head = soup.head

        _doi_url = "https://doi.org/" + head.find(attrs={"name": "DOI"}).get("content")
        _title = head.find(attrs={"name": "citation_title"}).get("content")
        _journal = head.find(attrs={"name": "citation_journal_title"}).get("content")
        _published = head.find(attrs={"name": "prism.publicationDate"}).get("content")

        # 'find_all' replaces the deprecated camelCase alias 'findAll'
        _contributors = [creator.get("content")
                         for creator in head.find_all(attrs={"name": "dc.creator"})]
        _subjects = [subject.get("content")
                     for subject in head.find_all(attrs={"name": "dc.subject"})]

        return Publication(_doi_url, _title, _contributors, _journal, _published, _subjects, 0)

        # TODO: Exceptions-handling
        #   raise ValueException("Cant Fetch: '{}'".format(error))
        #   return None

    @staticmethod
    def test_fetcher():
        """Placeholder for the unit-test of this fetcher; does nothing yet."""
        pass
#!/usr/bin/env python3
"""
Child class of JournalFetcher
Usage: None, this is just a template and should be ignored
"""
# import re
from input.get.journal_fetcher import JournalFetcher
from input.publication import Publication
class Fetcher(JournalFetcher):
    """
    Template for new fetchers. It has no functionality of its own:
    it accepts no url and creates no Publication.
    """

    # TODO: Naming-Convention:
    #   Class: 'Fetcher'
    #   file: [journal-/organisation-name]
    #       format = "[a-z]*.py" allowed
    # TODO: List of Compatable Journals
    SUPPORTED_JOURNALS = []

    @staticmethod
    def can_use_url(url: str) -> bool:
        """
        Checks if given url links to a supported journal.

        The template supports nothing and therefore always answers False.
        """
        # TODO: Check the URL for compatability
        #   url_re = re.match(r'(https?://)?(doi.org/)?(10.(\d{4})/\w+.\S+)', url)
        #   return url_re[4] in SUPPORTED_JOURNALS
        return False

    @staticmethod
    def get_publication(url: str) -> Publication:
        """
        Creates a Publication-instance.

        The template fetches nothing and therefore always returns None.
        """
        # TODO: Fetch data from the HTML
        #   soup = JournalFetcher.get_soup(url)
        #   doi,title,contributors[],journal,publication_date,subjects[],references[],citations[]
        # TODO: Create new Publication-instance
        #   return Publication(doi,title,contributors[],journal,publication_date,subjects[],num_citation=None ,references[],citations[])
        return None

    @staticmethod
    def test_fetcher():
        """Placeholder for the unit-test of this fetcher; does nothing yet."""
        pass
#!/usr/bin/env python3
"""
Interface for the Input-Package only this should be accessed from outside this Package.
"""
from os import walk
import importlib
import pathlib
import re
from input.publication import Publication
class InputInterface:
    """
    Singleton which dynamically imports and manages fetchers
    """

    # Path of the 'get'-package; set by get_fetcher_classes()
    get_path = None
    # 'Fetcher'-classes of all successfully imported fetcher modules
    fetcher_classes = []

    @staticmethod
    def get_publication(url: str) -> Publication:
        """
        The interface-method to get a Publication-instance

        Parameters
        ----------
        :param url: url to a Publication
        :type url: str
        :return: Publication instance or None if not supported
        :raises AttributeError: if no fetcher class could be found at all
        """
        # Initializes 'fetcher_classes', the list of imported fetcher classes
        if not InputInterface.fetcher_classes:
            InputInterface.get_fetcher_classes()
            if not InputInterface.fetcher_classes:
                raise AttributeError("No specific Fetchers where found at: '{}'"
                                     .format(InputInterface.get_path))

        # The first fetcher that accepts the 'url' creates the Publication.
        for fetcher_class in InputInterface.fetcher_classes:
            if fetcher_class.can_use_url(url):
                return fetcher_class.get_publication(url)

        # No Module for given url was found
        return None

    @staticmethod
    def get_fetcher_classes():
        """
        Searches in 'get', if there are [a-z]*.py modules (specific Fetchers)
        and tries to import them.
        Saves the 'Fetcher'-class of every found module in 'fetcher_classes'.

        :return: None
        :raises ImportError: if one of the found modules can not be imported
        """
        # Path to 'get'-package
        InputInterface.get_path = '{}/get'.format(pathlib.Path(__file__).parent.resolve())

        # Searches for modules with given Pattern. fullmatch with an escaped
        # dot keeps out names like 'foo.pyc' or 'foo.py.bak'. The names are
        # sorted so that the order in which fetchers are asked does not
        # depend on the directory order of the file system.
        top_level_files = next(walk(InputInterface.get_path), (None, None, []))[2]
        fetcher_file_names = sorted(
            file for file in top_level_files
            if re.fullmatch(r'[a-z]+\.py', file) is not None
        )

        if fetcher_file_names:
            print("Found following Modules: {}".format(", ".join(fetcher_file_names)))

        # Tries to import those modules and saves their 'Fetcher'-class
        for file in fetcher_file_names:
            module_name = file[:-3]  # file name without the '.py' suffix
            try:
                fetcher_module = importlib.import_module("input.get.{}".format(module_name))
            except Exception as error:
                # Keep the original error as cause, it tells why the import failed.
                raise ImportError("Module '{}' can not be imported".format(module_name)) from error

            try:
                InputInterface.fetcher_classes.append(getattr(fetcher_module, 'Fetcher'))
            except AttributeError:
                # A module without a 'Fetcher'-class is reported and skipped.
                print("Module '{}' does not have a 'Fetcher'-class".format(module_name))
#!/usr/bin/env python3
class Publication:
    """
    Represents a Publications
    """

    def __init__(self, doi_url: str, title: str,
                 contributors: "list[str]", journal: str,
                 publication_date: str, subjects: "list[str]",
                 num_citations: "int | None" = None,
                 references: "list[Citation] | None" = None,
                 citations: "list[Citation] | None" = None):
        """
        Parameters
        ----------
        :param doi_url: doi_url of the publication
        :type doi_url: str
        :param title: title of the publication
        :type title: str
        :param contributors: list of all contributors
        :type contributors: list[str]
        :param journal: journal the publication appeared in
        :type journal: str
        :param publication_date: date of release
        :type publication_date: str
        :param subjects: the subject of the Publication
        :type subjects: list[str]
        :param num_citations: number of citations; defaults to the length
                              of 'citations' when omitted
        :type num_citations: int
        :param references: the Citation which is been referenced by this Publication
        :type references: list[Citation]
        :param citations: the Citation which references this Publication
        :type citations: list[Citation]
        :return: None
        """
        self.doi_url = doi_url
        self.title = title
        self.contributors = contributors
        self.journal = journal
        self.publication_date = publication_date
        self.subjects = subjects
        # A fresh list per instance, never a shared default
        self.references = [] if references is None else references
        self.citations = [] if citations is None else citations
        if num_citations is None:
            self.num_citations = len(self.citations)
        else:
            # Open question from the original author: is this needed at all,
            # or would len(citations) be enough?
            self.num_citations = num_citations

    def __str__(self) -> str:
        """Returns a multi-line summary including all references and citations."""
        return ("Title:        {}\n"
                "Doi-url:      {}\n"
                "Authors:      {}\n"
                "Journal:      {}\n"
                "Published on: {}\n"
                "Subjects:     {}\n"
                "References:   \n{}\n"
                "Citations:    \n{}\n")\
            .format(self.title, self.doi_url, ", ".join(self.contributors),
                    self.journal, self.publication_date,
                    ", ".join(self.subjects),
                    "\n".join(self.get_citation_string(self.references)),
                    "\n".join(self.get_citation_string(self.citations)))

    @staticmethod
    def get_citation_string(citations):
        """
        Returns the string form of every entry, or ["None"] for an empty input.

        :param citations: list of Citations (or any objects with a string form)
        :return: list[str]
        """
        if not citations:
            return ["None"]
        return [str(citation) for citation in citations]

    def add_citations(self, citation) -> list:
        """
        Appends a list of Citations or Citation to self.citations.
        Nothing is appended if any element has the wrong type.

        Parameter
        ---------
        :param citation: Citation or Reference of the Publication
        :type citation: Citation or list[Citation]
        :return: self.citations
        :raises TypeError: if 'citation' is neither a Citation nor a list of Citations
        """
        if isinstance(citation, list):
            new_citations = citation
        else:
            new_citations = [citation]

        # Validate everything first so a bad element does not leave a
        # partially extended list behind.
        for _cit in new_citations:
            if not isinstance(_cit, Citation):
                raise TypeError("add_citations expects Citations or List of Citations, not: '{}'"
                                .format(type(_cit)))

        self.citations.extend(new_citations)
        return self.citations

    # The method used to be called 'citations'. On instances that name is
    # shadowed by the list attribute assigned in __init__, so it could only
    # be reached through the class. The alias keeps that access working.
    citations = add_citations

    def __eq__(self, other) -> bool:
        """ Compares the unique doi_url of two Publications"""
        try:
            return self.doi_url == other.doi_url
        except AttributeError:
            # 'other' has no doi_url (e.g. None): let Python fall back to its
            # default comparison instead of raising.
            return NotImplemented

    def print_pub(self):
        """Prints title, date, doi-url, subjects, contributors and all citations to stdout."""
        print(f'''Article title:    {self.title}
Publication date: {self.publication_date}
DOI-URL:          {self.doi_url}
Subjects:''')
        print(*(self.subjects), sep = ", ")
        print('\nContributors:')
        print(*(self.contributors), sep = ", ")

        if int(self.num_citations) > 0:
            if int(self.num_citations) == 1:
                print('\nThis publication is cited by the following publication:\n')
            else:
                print(f'\nThis publication is cited by the following {self.num_citations} publications:\n')
            for citation in self.citations:
                print(f'''
Title:        {citation.title}
Journal:      {citation.journal}
Contributors: {citation.contributors}
DOI-URL:      {citation.doi_url}
''')
        else:
            print('\nThis publication is not cited by any other publication.')
class Citation:
    """
    A publication that cites, or is referenced by, a Publication.
    """

    def __init__(self, doi_url: str, title: str,
                 journal: str, contributors: list[str],
                 cit_type: str = "Citation"):
        """
        Parameters
        ----------
        :param doi_url: doi_url of the publication
        :type doi_url: str
        :param title: title of the publication
        :type title: str
        :param journal: journal of the publication
        :type journal: str
        :param contributors: list of all contributors
        :type contributors: list[str]
        :param cit_type: Specifies if Reference or Citation
        :type cit_type: str
        :return: None
        """
        self.doi_url = doi_url
        self.title = title
        self.journal = journal
        self.contributors = contributors
        self.cit_type = cit_type

    def __str__(self) -> str:
        """Returns four tab-indented lines, each prefixed with the cit_type."""
        kind = self.cit_type
        names = ", ".join(self.contributors)
        parts = [
            "\t{}-Title:        {}\n".format(kind, self.title),
            "\t{}-Doi:          {}\n".format(kind, self.doi_url),
            "\t{}-Journal:      {}\n".format(kind, self.journal),
            "\t{}-Contributors: {}\n".format(kind, names),
        ]
        return "".join(parts)
# This is just a replica of Citations
class Reference:
    """
    Same data as a Citation, but always printed with the 'References' label.
    """

    def __init__(self, doi_url: str, title: str, journal: str, contributors: list[str]):
        """
        :param doi_url: doi_url of the referenced publication
        :type doi_url: str
        :param title: title of the referenced publication
        :type title: str
        :param journal: journal of the referenced publication
        :type journal: str
        :param contributors: list of all contributors
        :type contributors: list[str]
        :return: None
        """
        self.doi_url = doi_url
        self.title = title
        self.journal = journal
        self.contributors = contributors

    def __str__(self) -> str:
        """Returns four tab-indented lines; the last one has no trailing newline."""
        names = ", ".join(self.contributors)
        template = ("\tReferences-Title:        {}\n"
                    "\tReferences-Doi:          {}\n"
                    "\tReferences-Journal:      {}\n"
                    "\tReferences-Contributors: {}")
        return template.format(self.title, self.doi_url, self.journal, names)
beautifulsoup4
requests
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment