Skip to content
Snippets Groups Projects
Commit 85ece1d0 authored by Stahl, Merle's avatar Stahl, Merle
Browse files

Merge remote-tracking branch 'upstream/main' into main

parents 6594b1a7 95e73bd7
No related branches found
No related tags found
2 merge requests!10Output,!9Main
This commit is part of merge request !9. Comments created here will be created in the context of that merge request.
Showing
with 1304 additions and 0 deletions
**/__pycache__/
#!/usr/bin/env python3
from input.interface import InputInterface as Input
def count_journals(url: str):
    """Fetch the publication behind *url* and tally its journal names.

    Increments the module-global ``cit`` mapping (journal name -> count)
    once for every citation and every reference of the publication.

    :param url: doi-url of the publication to analyse
    :type url: str

    NOTE(review): ``cit`` is only defined inside the ``__main__`` guard
    below, so calling this function from another module raises NameError
    — confirm before reusing this outside the script.
    """
    inter = Input()
    pub = inter.get_publication(url)
    # Citations and references expose the same 'journal' attribute,
    # so both groups are folded into one tally with dict.get()
    # (replaces the duplicated if/else counting blocks).
    for group in (pub.citations, pub.references):
        if group:
            for entry in group:
                cit[entry.journal] = cit.get(entry.journal, 0) + 1
if __name__ == "__main__":
    # Shared tally read and written by count_journals().
    cit = {}
    urls = [
        "https://doi.org/10.1021/acs.jcim.1c00203",
        "https://doi.org/10.1021/acs.jcim.6b00561",
        "https://doi.org/10.1021/acs.jcim.6b00613",
        "https://doi.org/10.1021/acs.jcim.1c00917",
        "https://doi.org/10.1021/acs.jmedchem.0c01332",
        # "https://pubs.acs.org/doi/10.1021/acs.biochem.1c00290",
        # "https://pubs.acs.org/doi/10.1021/acsenvironau.1c00007",
        # "https://pubs.acs.org/doi/10.1021/acs.biochem.7b01162",
    ]
    for url in urls:
        count_journals(url)
    # Sort journals by ascending citation count before printing.
    cit = dict(sorted(cit.items(), key=lambda item: item[1]))
    for journal in cit:
        if journal != "":
            print(f'{journal}: {cit[journal]}')
#!/usr/bin/env python3
from input.interface import InputInterface as Input
def main(url: str):
    """Fetch the lightweight publication record for *url* and print it."""
    interface = Input()
    # print(interface.get_publication(url))
    print(interface.get_pub_light(url))
    # print(interface.get_supported_fetchers())  # useless: all fetcher classes share one name
if __name__ == "__main__":
    # Example DOI of a supported ACS publication.
    #main("https://doi.org/10.1021/acs.jcim.1c0023")
    main("https://doi.org/10.1021/acs.jcim.5b00332")
# Projekt CiS-Projekt 2021/22
Input-Package to fetch publication information with a given url.
## Usage/Examples
```python
from input.interface import InputInterface as Input
from input.publication import Publication
def main(url):
inter = Input()
try:
pub = inter.get_publication(url)
except Exception as error:
raise error
print(pub)
pub.title = "Cool new Title"
print(pub)
if __name__ == "__main__":
main("https://doi.org/10.1021/acs.chemrev.8b00728")
```
The expected results of calling this method are:
| Input-Url | Result |
|-----------|-----------|
| supported & correct| A publication Instance |
| supported & incorrect | ValueError|
| not supported | ValueError|
Supported urls are those which comply with the url patterns of the supported journals.
### Supported Journals:
- ACS-Journals
- (Nature-Journals)
## Testing
```bash
python -m unittest input/test/<file.py> -v
# for all tests in directory
python -m unittest discover input/test -v
```
## Authors
- Florian Jochens
- Sam Ockenden
- Julius Schenk
\ No newline at end of file
#!/usr/bin/env python3
"""
Child class of JournalFetcher
Usage: Check if Url can be used with 'can_use_url'
and then fetch publication with 'get_publication'
"""
import re
from input.get.journal_fetcher import JournalFetcher
from input.publication import Publication, Citation
class Fetcher(JournalFetcher):
    """
    Specific fetcher for the ACS journals.

    Usage: check the url with 'can_use_url' and then fetch the
    publication with 'get_publication' (or 'get_pub_light' for the
    main article without references/citations).
    """

    # DOI registrant code used by all ACS journals ('10.1021/...').
    SUPPORTED_JOURNALS = ['1021']

    @staticmethod
    def can_use_url(url: str) -> bool:
        """
        Uses regex to extract the journal specific substring of the doi.

        :param url: url (or bare doi) of a publication
        :return: True if the doi registrant code belongs to ACS
        TODO: Support non Doi-urls
        """
        # Dots in the host parts and the '10.' doi prefix are escaped so
        # they only match literal dots; the trailing '\w+.\S+' is left
        # loose on purpose — older ACS dois contain no dot after the '/'.
        matched_url = re.match(r'^(https?://)?(doi\.org/|pubs\.acs\.org/doi/)?(10\.(\d{4})/\w+.\S+)', url.strip(". \t\r\n"))
        # Checks if match exists (group 4 is the doi registrant code)
        if matched_url is not None:
            return matched_url[4] in Fetcher.SUPPORTED_JOURNALS
        else:
            return False

    @staticmethod
    def get_pub_light(url: str) -> Publication:
        """
        Fetches html and creates a BeautifulSoup-instance in the parent class.
        Specific css-searches for ACS-Journals and creates a Publication-instance
        without references/citations.

        :param url: url of a supported publication
        :raise ValueError: if the url matches the ACS pattern but doesn't link to a paper
        :return: Publication of the main article only
        """
        # Creation of Soup
        try:
            soup = JournalFetcher.get_soup(url)
        except Exception as error:
            raise error

        # Raise Error if re recognizes Pattern, but url isnt correct:
        # For other Urls
        if soup.text.strip(" \t\n") == "Missing resource null":
            raise ValueError("'{}' matches Pattern for 'ACS', but doesnt link to Paper.".format(url))
        # For Dois
        if soup.title is not None:
            if soup.title.text == "Error: DOI Not Found":
                raise ValueError("'{}' matches Pattern for 'ACS', but doesnt link to Paper.".format(url))

        soup_header = soup.select('.article_header')[0]

        # Creates Publication
        doi_url = soup_header.select('a[title="DOI URL"]')[0].string
        title = soup_header.select(".hlFld-Title")[0].text
        contributors = []
        for author in soup_header.select(".hlFld-ContribAuthor"):
            contributors.append(author.text)
        journal = soup_header.select(".cit-title")[0].text
        # Replaces abbreviation with whole name
        if journal in JournalFetcher.abbrev_dict:
            journal = JournalFetcher.abbrev_dict[journal]
        published = soup_header.select(".pub-date-value")[0].text
        subjects = []
        subject_soup = soup_header.select('.article_header-taxonomy')[0]
        for subject in subject_soup.select('a'):
            subjects.append(subject.text)

        return Publication(doi_url, title, contributors, journal, published,
                           subjects)

    # Bug fix: decorated with @staticmethod for consistency with the
    # abstract declaration in JournalFetcher (was a plain function).
    @staticmethod
    def get_publication(url: str) -> Publication:
        """
        Fetches html and creates a BeautifulSoup-instance in the parent class.
        Specific css-searches for ACS-Journals and creates a Publication-instance
        including its references and citations.

        :param url: url of a supported publication
        :raise ValueError: if the url matches the ACS pattern but doesn't link to a paper
        :return: Publication with references and citations
        """
        # Creation of Soup
        try:
            soup = JournalFetcher.get_soup(url)
        except Exception as error:
            raise error

        # Raise Error if re recognizes Pattern, but url isnt correct:
        # For other Urls
        if soup.text.strip(" \t\n") == "Missing resource null":
            raise ValueError("'{}' matches Pattern for 'ACS', but doesnt link to Paper.".format(url))
        # For Dois
        if soup.title is not None:
            if soup.title.text == "Error: DOI Not Found":
                raise ValueError("'{}' matches Pattern for 'ACS', but doesnt link to Paper.".format(url))

        soup_header = soup.select('.article_header')[0]
        # Could be used for more specific search
        ref_cit_soup = soup

        # Creates Publication
        doi_url = soup_header.select('a[title="DOI URL"]')[0].string
        title = soup_header.select(".hlFld-Title")[0].text
        contributors = []
        for author in soup_header.select(".hlFld-ContribAuthor"):
            contributors.append(author.text)
        journal = soup_header.select(".cit-title")[0].text
        # Replaces abbreviation with whole name
        if journal in JournalFetcher.abbrev_dict:
            journal = JournalFetcher.abbrev_dict[journal]
        published = soup_header.select(".pub-date-value")[0].text
        subjects = []
        subject_soup = soup_header.select('.article_header-taxonomy')[0]
        for subject in subject_soup.select('a'):
            subjects.append(subject.text)

        # References: every <li> with a '.refDoi' is a linkable paper.
        references = []
        references_soup = ref_cit_soup.select('ol#references')
        if references_soup != []:
            for reference in references_soup[0].select('li'):
                if reference.select('.refDoi') != []:
                    # '.refDoi' text starts with 'DOI: ' — strip the prefix.
                    ref_doi = "https://doi.org/{}".format(reference.select('.refDoi')[0].text.strip()[5:])
                else:
                    # No Doi -> No Paper
                    continue
                ref_title = reference.select('.NLM_article-title')[0].text\
                    if reference.select('.NLM_article-title') != [] else None
                ref_journal = reference.select('i')[0].text\
                    if reference.select('i') != [] else None
                # Replaces abbreviation with whole name
                if ref_journal in JournalFetcher.abbrev_dict:
                    ref_journal = JournalFetcher.abbrev_dict[ref_journal]
                ref_contributors = []
                for author in reference.select('.NLM_contrib-group'):
                    ref_contributors.append(author.text.replace("\n", " ").replace("\r", ""))
                references.append(Citation(ref_doi, ref_title, ref_journal, ref_contributors, cit_type="Reference"))

        # Citations: every <li> with a doi-url link is a linkable paper.
        citations = []
        citation_soup = ref_cit_soup.select('.cited-content_cbyCitation')
        if citation_soup != []:
            for citation in citation_soup[0].select('li'):
                if citation.select('a[title="DOI URL"]') != []:
                    cit_doi = citation.select('a[title="DOI URL"]')[0].text
                else:
                    # No Doi -> No Paper
                    continue
                cit_title = citation.select('.cited-content_cbyCitation_article-title')[0].text\
                    if citation.select('.cited-content_cbyCitation_article-title') != [] else None
                cit_journal = citation.select('.cited-content_cbyCitation_journal-name')[0].text\
                    if citation.select('.cited-content_cbyCitation_journal-name') != [] else None
                # Replaces abbreviation with whole name
                if cit_journal in JournalFetcher.abbrev_dict:
                    cit_journal = JournalFetcher.abbrev_dict[cit_journal]
                cit_contributors = []
                cit_contributors = citation.select('.cited-content_cbyCitation_article-contributors')[0]\
                    .text.replace("\n", " ").replace("\r", "").split(', ')
                # clean up of the last Entry (trailing '.' produces an empty split item)
                cit_contributors_last = cit_contributors.pop().strip(". ")
                if cit_contributors_last != '':
                    cit_contributors.append(cit_contributors_last)
                citations.append(Citation(cit_doi, cit_title, cit_journal, cit_contributors, cit_type="Citation"))

        return Publication(doi_url, title, contributors, journal, published
                           , subjects, references, citations)
#!/usr/bin/env python3
"""
Parent class for specific Journal
"""
from abc import ABCMeta, abstractmethod
from bs4 import BeautifulSoup
import requests
from input.publication import Publication
class JournalFetcher(metaclass=ABCMeta):
    """
    Abstract base class for the journal-specific fetcher modules.
    """

    @staticmethod
    def get_soup(url: str) -> BeautifulSoup:
        """
        Retrieves website-html and returns a BeautifulSoup-instance

        Parameters:
        -----------
        :type url: str
        :param url: doi-url to a publication
        :return: BeautifulSoup-instance
        """
        # NOTE(review): requests.get() itself never raises HTTPError
        # (that comes from raise_for_status()), so this handler is
        # effectively inert; connection/timeout errors propagate to the
        # caller unchanged — confirm the intended error policy.
        try:
            req = requests.get(url)
        except requests.exceptions.HTTPError as err:
            raise SystemExit(err)

        return BeautifulSoup(req.content, 'html.parser')

    @staticmethod
    @abstractmethod
    def can_use_url(url: str) -> bool:
        """
        Abstract-function to be implemented in subclass.
        Checks if given url links to a supported journal
        """
        # Bug fix: NotImplementedError (not AttributeError) is the
        # conventional exception for a missing override.
        raise NotImplementedError("JournalFetcher for '{}' hasnt implemented 'can_use_url()'".format(url))

    @staticmethod
    @abstractmethod
    def get_publication(url: str) -> Publication:
        """
        Abstract-function to be implemented in subclass.
        Creates a Publication-instance.
        """
        raise NotImplementedError("JournalFetcher for '{}' hasnt implemented 'get_publication()'".format(url))

    # A dictionary which maps journal abbreviations to full journal names.
    abbrev_dict = {
        # Bug fix: 'Nat. Protoc.' is Nature Protocols (the previous value,
        # 'Journal of Natural Products', abbreviates as 'J. Nat. Prod.').
        "Nat. Protoc.": "Nature Protocols",
        "PLoS Comput. Biol.": "PLoS Computational Biology",
        "PLoS One": "PLoS One",
        "Protein Sci.": "Protein Science",
        "J. Am. Chem. Soc.": "Journal of the American Chemical Society",
        "J. Chem. Phys.": "Journal of Chemical Physics",
        "Appl. Sci.": "Applied Science",
        "Comput. Sci. Eng.": "Computing in Science & Engineering",
        "Beilstein J. Org. Chem.": "Beilstein Journal of Organic Chemistry",
        "Biol. Chem.": "Biological Chemistry",
        "Isr. J. Chem.": "Israel Journal of Chemistry",
        "Nat. Methods": "Nature Methods",
        "Proc. Natl. Acad. Sci. U. S. A.": "Proceedings of the National Academy of Sciences of the United States of America",
        "J. Phys. Chem. B": "Journal of Physical Chemistry B",
        "Carbohydr. Res.": "Carbohydrate Research",
        "J. Chem. Theory Comput.": "Journal of Chemical Theory and Computation",
        "J. Mol. Biol.": "Journal of Molecular Biology",
        "Nucleic Acids Res.": "Nucleic Acids Research",
        "J. Comput. Chem.": "Journal of Computational Chemistry",
        "J. Cheminf.": "Journal of Cheminformatics",
        "J. Med. Chem.": "Journal of Medicinal Chemistry",
        "J. Comput.-Aided Mol. Des.": "Journal of Computer-Aided Molecular Design",
        "J. Chem. Inf. Model.": "Journal of Chemical Information and Modeling",
        "Mol. Cell": "Molecular Cell",
        "J. Cell Biolog.": "Journal of Cell Biology",
        "Mol. Cell Biol.": "Molecular and Cellular Biology",
        "J. Cell Sci.": "Journal of Cell Science",
        "Nat. Cell Biol.": "Nature Cell Biology",
        "J. Aerosol Sci. Technol.": "Aerosol Science and Technology",
        "Mol. Biol. Cell": "Molecular Biology of the Cell",
        "Build. Environ.": "Building and Environment",
        "Sci. Rep.": "Scientific Reports",
        "Nat. Chem.": "Nature Chemistry",
        "Nat. Med.": "Nature Medicine",
        "Nat. Commun.": "Nature Communications",
        "Exp. Cell Res.": "Experimental Cell Research",
        "Nat. Chem. Biol.": "Nature Chemical Biology",
    }
\ No newline at end of file
#!/usr/bin/env python3
"""
Child class of JournalFetcher
Usage: Check if Url can be used with 'can_use_url'
and then fetch publication with 'get_publication'
"""
# import re
from input.get.journal_fetcher import JournalFetcher
from input.publication import Publication
class Fetcher(JournalFetcher):
    """
    Fetcher for Nature journals; scrapes metadata from the page's <meta> tags.
    """

    # TODO: list of compatible journals
    # NOTE: nature does not use journal names in doi links,
    #       must match by 10.xxxx identifier instead
    SUPPORTED_JOURNALS = []

    @staticmethod
    def can_use_url(url: str) -> bool:
        """
        Checks if given url links to a supported journal.
        """
        # TODO: match the doi registrant code against SUPPORTED_JOURNALS
        return False

    @staticmethod
    def get_publication(url: str) -> Publication:
        """
        Creates a Publication-instance from the page's <meta> headers.
        """
        soup = JournalFetcher.get_soup(url)
        head = soup.head

        def _meta(name):
            # Value of the single <meta name=...> tag with that name.
            return head.find(attrs={"name": name}).get("content")

        _doi_url = "https://doi.org/" + _meta("DOI")
        _title = _meta("citation_title")
        _journal = _meta("citation_journal_title")
        _published = _meta("prism.publicationDate")
        _contributors = [tag.get("content")
                         for tag in head.findAll(attrs={"name": "dc.creator"})]
        _subjects = [tag.get("content")
                     for tag in head.findAll(attrs={"name": "dc.subject"})]

        return Publication(_doi_url, _title, _contributors, _journal, _published, _subjects)

        # TODO: exception handling
        # raise ValueException("Cant Fetch: '{}'".format(error))
        # return None
#!/usr/bin/env python3
"""
Child class of JournalFetcher
Usage: None, this is just a template and should be ignored
"""
# import re
from input.get.journal_fetcher import JournalFetcher
from input.publication import Publication
class Fetcher(JournalFetcher):
    """
    Template fetcher without any functionality; copy this file to start
    a new journal-specific fetcher.
    """
    # Naming convention:
    #   class: 'Fetcher'
    #   file:  [journal-/organisation-name], format "[a-z]*.py"

    # TODO: list of compatible journals
    SUPPORTED_JOURNALS = []

    @staticmethod
    def can_use_url(url: str) -> bool:
        """
        Checks if given url links to a supported journal.
        """
        # TODO: check the url for compatibility, e.g.:
        # url_re = re.match(r'(https?://)?(doi.org/)?(10.(\d{4})/\w+.\S+)', url)
        # if url_re is not None:
        #     return url_re[4] in SUPPORTED_JOURNALS
        return False

    @staticmethod
    def get_publication(url: str) -> Publication:
        """
        Creates a Publication-instance.
        """
        # TODO: fetch the data from the html and build the instance:
        # soup = JournalFetcher.get_soup(url)
        # doi,title,contributors[],journal,publication_date,subjects[],references[],citations[]
        # return Publication(doi_url, title, contributors, journal,
        #                    publication_date, subjects, references, citations)
        return None
\ No newline at end of file
#!/usr/bin/env python3
"""
Interface for the Input-Package only this should be accessed from outside this Package.
"""
from os import walk
import importlib
import pathlib
import re
from input.publication import Publication
class InputInterface:
    """
    Singleton which dynamically imports and manages the fetcher classes.
    """
    # The one shared instance (class-level).
    instance = None
    # Path to the 'get'-package, set on the first import scan.
    get_path = None
    # All dynamically imported 'Fetcher'-classes (shared, class-level).
    fetcher_classes = []

    # '__new__' is called before '__init__' and gives us an instance
    def __new__(cls, *args, **kwargs):
        # Checks if an instance exists and if it doesnt creates one.
        if cls.instance is None:
            # object.__new__ takes no extra arguments — do not forward them.
            cls.instance = super(InputInterface, cls).__new__(cls)
        return cls.instance

    def __init__(self):
        # Imports all fetcher modules on first construction.
        if self.fetcher_classes == []:
            self.import_fetcher_classes()
        if self.fetcher_classes == []:
            raise AttributeError("No specific Fetchers where found at: '{}'"
                                 .format(self.get_path))

    def get_publication(self, url: str) -> Publication:
        """
        The interface-method to get a Publication-instance
        (including it's citations and references)

        Parameters
        ----------
        :param url: url to a Publication
        :type url: str
        :raise ValueError: if no fetcher supports 'url'
        :return: Publication instance
        """
        # Checks if a fetcher supports the 'url' and
        # returns a Publication if it does.
        for fetcher_class in InputInterface.fetcher_classes:
            if fetcher_class.can_use_url(url):
                return fetcher_class.get_publication(url)
        # No Module for given url was found
        raise ValueError("'{}' is not supported".format(url))

    def get_pub_light(self, url: str) -> Publication:
        """
        The interface-method to get a Publication-instance
        (only for main article)

        Parameters
        ----------
        :param url: url to a Publication
        :type url: str
        :raise ValueError: if no fetcher supports 'url'
        :return: Publication instance
        """
        # Checks if a fetcher supports the 'url' and
        # returns a Publication if it does.
        for fetcher_class in InputInterface.fetcher_classes:
            if fetcher_class.can_use_url(url):
                return fetcher_class.get_pub_light(url)
        # No Module for given url was found
        raise ValueError("'{}' is not supported".format(url))

    def get_supported_fetchers(self):
        """Returns the class names of all imported fetchers."""
        # Currently all classes are called 'Fetcher', so the names
        # are not very distinctive yet.
        return [a.__name__ for a in self.fetcher_classes]

    def import_fetcher_classes(self):
        """
        Searches in 'get', if there are [a-z]*.py modules (specific Fetchers)
        and imports their 'Fetcher'-class into 'fetcher_classes'.

        :raise ImportError: if a module cannot be imported or
                            has no 'Fetcher'-class
        """
        # Path to 'get'-package
        self.get_path = '{}/get'.format(pathlib.Path(__file__).parent.resolve())

        # Searches for modules with the fetcher naming convention.
        # Bug fix: pattern is now anchored and the dot escaped, so e.g.
        # 'acs.pyc' or 'acsXpyz' are no longer picked up.
        fetcher_file_names = []
        for file in next(walk(self.get_path), (None, None, []))[2]:
            if re.match(r'^[a-z]+\.py$', file) is not None:
                fetcher_file_names.append(file)

        # Tries to import those modules and saves their 'Fetcher'-class
        for file in fetcher_file_names:
            try:
                fetcher_module = importlib.import_module("input.get.{}".format(file[:-3]))
            except Exception as error:
                raise ImportError("Module '{}' can not be imported".format(file[:-3])) from error
            try:
                self.fetcher_classes.append(fetcher_module.Fetcher)
            except AttributeError as error:
                # Bug fix: this ImportError was previously created but
                # never raised, silently skipping broken modules.
                raise ImportError("Module '{}' does not have a 'Fetcher'-class".format(file[:-3])) from error
#!/usr/bin/env python3
# this is needed for typing pre python 3.9, this maybe as an large Overhead
from typing import Any, List
class Publication:
    """
    Represents a Publication
    """
    def __init__(self, doi_url: str, title: str
                 , contributors: List[str], journal: str
                 , publication_date: str, subjects: List[str]
                 , references: List[Any] = None, citations: List[Any] = None):
        """
        Parameters
        ----------
        :param doi_url: doi_url of the publication
        :type doi_url: str
        :param title: title of the publication
        :type title: str
        :param contributors: list of all contributors
        :type contributors: List[str]
        :param journal: journal the publication appeared in
        :type journal: str
        :param publication_date: date of release
        :type publication_date: str
        :param subjects: the subjects of the Publication
        :type subjects: List[str]
        :param references: the Citations which are referenced by this Publication
        :type references: List[Any]
        :param citations: the Citations which reference this Publication
        :type citations: List[Any]
        :return: None
        """
        self.doi_url = doi_url
        self.title = title
        self.contributors = contributors
        self.journal = journal
        self.publication_date = publication_date
        self.subjects = subjects
        # Mutable defaults are created per instance, never shared.
        if references is None:
            self.references = []
        else:
            self.references = references
        if citations is None:
            self.citations = []
        else:
            self.citations = citations
        # For the 'Verarbeitungsgruppe' (processing group downstream)
        self.group = None

    def __str__(self) -> str:
        return ("Title: {}\n"
                "Doi-url: {}\n"
                "Authors: {}\n"
                "Journal: {}\n"
                "Published on: {}\n"
                "Subjects: {}\n"
                "References: \n{}\n"
                "Citations: \n{}")\
            .format(self.title, self.doi_url, ", ".join(self.contributors)
                    , self.journal, self.publication_date
                    , ", ".join(self.subjects)
                    , "\n".join(self.get_citation_string(self.references))
                    , "\n".join(self.get_citation_string(self.citations)))

    @staticmethod
    def get_citation_string(citations):
        """Returns the str() of every citation, or ["None"] when empty."""
        if citations == []:
            return ["None"]
        else:
            citation_string = []
            for citation in citations:
                citation_string.append(str(citation))
            return citation_string

    def add_citations(self, citation) -> None:
        """
        Appends a list of Citations or a single Citation to self.citations.

        Parameter
        ---------
        :param citation: Citation or Reference of the Publication
        :type citation: Citation or list[Citation]
        :raise TypeError: if 'citation' is neither
        :return: self.citations
        """
        if type(citation) is Citation:
            self.citations.append(citation)
        # Checks if 'citation' is a list of Citations
        elif type(citation) is list:
            for _cit in citation:
                if type(_cit) is Citation:
                    self.citations.append(_cit)
                else:
                    # Bug fix: message referenced nonexistent '_set_citation'.
                    raise TypeError("add_citations expects Citations or List of Citations, not: '{}'"
                                    .format(type(_cit)))
        else:
            raise TypeError("add_citations expects Citations or List of Citations, not: '{}'"
                            .format(type(citation)))
        return self.citations

    def __eq__(self, other) -> bool:
        """ Compares the unique doi_url of two Publications"""
        if type(self) == type(other):
            return self.doi_url == other.doi_url
        return False

    def __hash__(self):
        # Bug fix: defining __eq__ alone made instances unhashable;
        # hash on the same key __eq__ compares (the unique doi_url).
        return hash(self.doi_url)
class Citation:
    """Record of one reference or citation belonging to a Publication."""

    def __init__(self, doi_url: str, title: str
                 , journal: str, contributors: List[str]
                 , cit_type: str = "Citation"):
        """
        Parameters
        ----------
        :param doi_url: doi_url of the cited publication
        :type doi_url: str
        :param title: title of the cited publication
        :type title: str
        :param journal: journal the cited publication appeared in
        :type journal: str
        :param contributors: list of all contributors
        :type contributors: List[str]
        :param cit_type: specifies if Reference or Citation
        :type cit_type: str
        :return: None
        """
        self.title = title
        self.doi_url = doi_url
        self.journal = journal
        self.contributors = contributors
        self.cit_type = cit_type

    def __str__(self) -> str:
        # Each line is prefixed with the citation type (Reference/Citation).
        prefix = self.cit_type
        parts = [
            "\t{}-Title: {}\n".format(prefix, self.title),
            "\t{}-Doi: {}\n".format(prefix, self.doi_url),
            "\t{}-Journal: {}\n".format(prefix, self.journal),
            "\t{}-Contributors: {}\n".format(prefix, ", ".join(self.contributors)),
        ]
        return "".join(parts)
beautifulsoup4
requests
\ No newline at end of file
This diff is collapsed.
import unittest
from input.get.journal_fetcher import JournalFetcher
from input.interface import InputInterface
from input.publication import Publication
"""
Testing the Publication fetcher
Publication 1: 'https://doi.org/10.1021/acs.jcim.1c00203'
Publication 2: 'doi.org/10.1021/acs.jcim.1c00917'
Publication 3: '10.1038/nchem.1781'
Publication 4: '11.12/jaj'
Publication 5: '11.12/'
Publication 6: 'https://doi.org/10.1021/acs.jmedchem.0c01332' # Paper is a PDF
"""
# TODO: Testcases for:
# - Specific Journals: Inherit from FetcherTestCase
# - interface module-importer (test case)
# - Error detection
# - wrong/no Journal_fetchers
# - wrong urls
# - correct Types in publication
# - Edgecases (i.e. paper as pdf, no connection, etc)
class InterfaceTestCase(unittest.TestCase):
    """Tests the singleton behaviour of InputInterface."""

    def setUp(self):
        # NOTE(review): assumes no InputInterface was created before this
        # test runs — the singleton persists for the process lifetime, so
        # running other suites that construct one first would break this
        # precondition; verify test ordering.
        self.assertEqual(InputInterface.instance, None)
        self.interface = InputInterface()

    def test_singleton(self):
        # interface should already be made in setUp()
        self.assertNotEqual(self.interface.instance, None)
        # A second construction must return the same shared instance.
        new_interface = InputInterface()
        self.assertEqual(self.interface, new_interface)

    # def test_imported_modules(self):
    #     fetchers = self.interface.get_supported_fetchers
class FetcherTestCase(unittest.TestCase):
    """Shared assertions for the journal-specific fetcher test cases."""

    def can_use_url_test(self, fetcher: JournalFetcher, test_url: str, expected_res: bool):
        # Tests the 'can_use_url'-method
        self.assertEqual(fetcher.can_use_url(test_url), expected_res)

    def get_publication_test(self, fetcher: JournalFetcher, test_url: str, expected_res: Publication):
        """
        Asserts that every field of the fetched publication equals the
        expected result.
        """
        actual_res = fetcher.get_publication(test_url)

        # Top-level fields of the publication itself.
        for field in ("doi_url", "title", "contributors", "journal",
                      "publication_date", "subjects"):
            self.assertEqual(getattr(actual_res, field), getattr(expected_res, field))

        # References and citations are compared element-wise
        # (same fields checked for both groups).
        for group in ("references", "citations"):
            actual_group = getattr(actual_res, group)
            expected_group = getattr(expected_res, group)
            self.assertEqual(len(actual_group), len(expected_group))
            for actual_cit, expected_cit in zip(actual_group, expected_group):
                for field in ("doi_url", "journal", "contributors", "cit_type"):
                    self.assertEqual(getattr(actual_cit, field), getattr(expected_cit, field))

    def get_publication_exception_test(self, fetcher: JournalFetcher, test_url: str):
        # Checks that an unusable url raises ValueError.
        with self.assertRaises(ValueError):
            fetcher.get_publication(test_url)
\ No newline at end of file
# Projekt CiS-Projekt 2021/22
Input-Skripts
File added
https://pubs.acs.org/doi/10.1021/acs.jcim.5b00332
https://pubs.acs.org/doi/10.1021/acs.jcim.6b00709
#!/usr/bin/env python3
"""
Functions for information retrieval of articles from the ACS journal JCIM
"""
__author__ = "Florian Jochens"
__email__ = "fj@andaco.de"
__status__ = "Production"
#__copyright__ = ""
#__credits__ = ["", "", "", ""]
#__license__ = ""
#__version__ = ""
#__maintainer__ = ""
from bs4 import BeautifulSoup as bs
import requests as req
import sys
from pathlib import Path
class Publication:
    """Metadata of one fetched article, plus its citations/references."""
    #_registry = []

    def __init__(self, title, publication_date, contributors, doi_url,
                 subjects=None, num_citations=None):
        """
        :param title: title of the article
        :param publication_date: date of release
        :param contributors: list of all contributors
        :param doi_url: doi-url of the article
        :param subjects: subjects listed on the article page
        :param num_citations: citation count shown on the page
        """
        #self._registry.append(self)
        self.title = title
        self.publication_date = publication_date
        self.contributors = contributors
        self.doi_url = doi_url
        self.subjects = subjects
        self.num_citations = num_citations
        # Bug fix: these used to be class attributes, so all instances
        # shared one citations/references list.
        self._citations = []
        self._references = []
class Citation:
    """One publication that cites the fetched article."""

    def __init__(self, title, journal, contributors, doi_url):
        # Plain value holder; fields are stored exactly as given.
        self.title, self.journal = title, journal
        self.contributors, self.doi_url = contributors, doi_url
class References:
    """One publication referenced by the fetched article."""

    def __init__(self, title, journal, contributors, doi_url):
        # Plain value holder; fields are stored exactly as given.
        self.title, self.journal = title, journal
        self.contributors, self.doi_url = contributors, doi_url
def get_article_info(soup):
    """Builds a Publication from the header section of an ACS article page.

    :param soup: BeautifulSoup of a full ACS article page
    :return: Publication with title, date, authors, doi-url, subjects and
             the citation count shown on the page
    """
    header = soup.find('div', class_ = 'article_header-left pull-left')
    article_title = header.find('span', class_ = 'hlFld-Title').text
    publication_date = header.find('span', class_ = 'pub-date-value').text
    # Keeps the href of the last link in the doiurl div as the doi-url.
    for link in header.find('div', class_ = 'article_header-doiurl'):
        doi_url = link.get('href')
    subs = header.find('div', class_ = 'article_header-taxonomy')
    subjects = []
    for sub in subs.find_all('a'):
        subjects.append(sub.get('title'))
    cons = header.find('ul', class_ = 'loa')
    contributors = []
    for con in cons.find_all('span', class_ = 'hlFld-ContribAuthor'):
        contributors.append(con.text)
    numc = header.find('div', class_ = 'articleMetrics_count')
    # No link inside the metrics block means the article is uncited.
    if not numc.a:
        num_citations = 0
    else:
        # NOTE(review): this is the link text (a string), while the
        # uncited branch stores the int 0 — callers int() it later.
        num_citations = numc.a.text
    pub = Publication(article_title, publication_date, contributors, doi_url,
                      subjects, num_citations)
    return pub
def get_download_url(soup):
    """Returns the ACS export url for the 'Citation and references' file.

    Bug fix: 'soup' used to be read from an undefined global (NameError
    on every call); it is now an explicit parameter.

    :param soup: BeautifulSoup of a full ACS article page
    :return: absolute download url (also printed to stdout)
    """
    export = soup.find('div', class_ = 'cit-download-dropdown_content')
    url = 'https://pubs.acs.org'
    for link in export.find_all('a'):
        if link.get('title') == 'Citation and references':
            url += link.get('href')
    print(url)
    return url
def download(url):  # Download citation and references file
    """Checks whether the citation/reference file behind *url* already
    exists under './files/'.

    Bug fix: the original used 'if url.find("="):', which is truthy for
    -1 (no '=' present) and then crashed on the rsplit indexing; urls
    without '=' are now a silent no-op.

    :param url: export url whose part after the last '=' is the filename
    """
    if '=' in url:
        filename = url.rsplit('=', 1)[1]
        path = Path('./files/' + filename)
        if path.is_file():
            print("File already exists")
        else:
            print("File does not exist")
def get_citation_info(pub, num_citations, soup):
    """Fills pub._citations with the articles citing this publication.

    :param pub: Publication the citations are attached to
    :param num_citations: number of citations shown on the page
    :param soup: BeautifulSoup of the full ACS article page

    NOTE(review): titles, journal names, doi-urls and contributors are
    collected in four independent lists and matched by index below;
    'find_all("a")' collects every link in the citation list, so the
    i-th url is assumed to belong to the i-th citation — verify this
    holds for the actual page layout.
    """
    pub._citations = []
    details = soup.find('ol', class_ = 'cited-content_cbyCitation')
    titles = []
    for title in details.find_all('span',
                                  class_ = 'cited-content_cbyCitation_article-title'):
        # Dots are stripped from the title text.
        titles.append(title.text.replace('.', ''))
    journal_names = []
    for name in details.find_all('span',
                                 class_ = 'cited-content_cbyCitation_journal-name'):
        journal_names.append(name.text)
    doi_urls = []
    for url in details.find_all('a'):
        doi_urls.append(url.get('href'))
    contributors = []
    for contrib in details.find_all('span',
                                    class_ = 'cited-content_cbyCitation_article-contributors'):
        contributors.append(contrib.text)
    # Builds one Citation per reported citation, matched by index.
    for i in range(0, int(num_citations)):
        pub._citations.append(Citation(titles[i], journal_names[i],
                                       contributors[i], doi_urls[i]))
def print_pub_info(pub):
    """Pretty-prints a Publication and its citing articles to stdout.

    :param pub: Publication (with _citations filled when cited)
    """
    print(f'''Article title: {pub.title}
Publication date: {pub.publication_date}
DOI-URL: {pub.doi_url}
Subjects:''')
    print(*(pub.subjects), sep = ", ")
    print('\nContributors:')
    print(*(pub.contributors), sep = ", ")
    # num_citations may be the int 0 or a digit string from the page.
    if int(pub.num_citations) > 0:
        if int(pub.num_citations) == 1:
            print(f'\nThis publication is cited by the following publication:\n')
        else:
            print(f'\nThis publication is cited by the following {pub.num_citations} publications:\n')
        for citation in pub._citations:
            print(f'''
Title: {citation.title}
Journal: {citation.journal}
Contributors: {citation.contributors}
DOI-URL: {citation.doi_url}
''')
    else:
        print('\nThis publication is not cited by any other publication.')
def input(url):
    """Fetches an ACS article page and returns it as a Publication.

    NOTE(review): shadows the builtin 'input' within this module.

    :param url: url of the ACS article
    :return: Publication, including citations when the page lists any
    """
    html_text = req.get(url).text
    soup = bs(html_text, 'html.parser')
    pub = get_article_info(soup)
    # Only scrape the citation list when the page reports citations.
    if int(pub.num_citations) > 0:
        get_citation_info(pub, int(pub.num_citations), soup)
    return pub
#if len(sys.argv) != 2:
# sys.stderr.write('Usage: {} <url>\n'.format(sys.argv[0]))
# exit(1)
#url = sys.argv[1]
#pub = input(url)
#print_pub_info(pub)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment