Skip to content
Snippets Groups Projects
Commit 9dc8680e authored by Malte Schokolowski's avatar Malte Schokolowski
Browse files

fixed merge conflict

parents 48276712 30836634
No related branches found
No related tags found
1 merge request!7Main
with 1159 additions and 2 deletions
#!/usr/bin/env python3
from input.interface import InputInterface as Input
def count_journals(url: str):
    """
    Counts how often each journal occurs among the citations and
    references of one publication.

    The publication is fetched through the InputInterface and the counts
    are added to the module-level dictionary 'cit', so several calls
    accumulate into the same tally.

    :param url: url of the publication to analyse
    :type url: str
    :return: None, the result is stored in the global 'cit'
    """
    inter = Input()
    pub = inter.get_publication(url)

    # Citations are tallied before references; empty or missing lists are skipped
    for entries in (pub.citations, pub.references):
        if not entries:
            continue
        for entry in entries:
            journal = entry.journal
            # 'get' starts unseen journals at 0 instead of resetting seen ones
            cit[journal] = cit.get(journal, 0) + 1
if __name__ == "__main__":
    cit = {}
    # Re-create the tally ordered by ascending number of occurrences
    cit = dict(sorted(cit.items(), key=lambda entry: entry[1]))
    for journal in cit:
        # Entries without a journal name are not printed
        if journal == "":
            continue
        print(f'{journal}: {cit[journal]}')
#!/usr/bin/env python3
from input.interface import InputInterface as Input
def main(url: str):
    """
    Creates the InputInterface; 'url' is not used by the visible body.

    :param url: url of a publication
    :type url: str
    """
    interface = Input()
    # print(interface.get_supported_fetchers()) Useless because all classes are called the same
if __name__ == "__main__":
# Projekt CiS-Projekt 2021/22
Input-Package to fetch publication information with a given url.
## Usage/Examples
from input.interface import InputInterface as Input
from input.publication import Publication
def main(url):
inter = Input()
pub = inter.get_publication(url)
except Exception as error:
raise error
pub.title = "Cool new Title"
if __name__ == "__main__":
The expected results of calling this method are:
| Input-Url | Result |
| supported & correct| A publication Instance |
| supported & incorrect| ValueError|
| not supported | ValueError|
Supported URLs are URLs that comply with the URL pattern of a supported journal.
### Supported Journals:
- ACS-Journals
- (Nature-Journals)
## Testing
```sh
python -m unittest input/test/<> -v
# for all tests in directory
python -m unittest discover input/test -v
```
## Authors
- Florian Jochens
- Sam Ockenden
- Julius Schenk
\ No newline at end of file
#!/usr/bin/env python3
Child class of JournalFetcher
Usage: Check if Url can be used with 'can_use_url'
and then fetch publication with 'get_publication'
import re
from input.get.journal_fetcher import JournalFetcher
from input.publication import Publication, Citation
class Fetcher(JournalFetcher):
Specific Fetcher for the ACS journals.
# Constant for the abbreviations of the supported Journals
def can_use_url(url: str) -> str:
Uses Regex to extract journal specific substrings in Doi.
TODO: Support non Doi-urls
matched_url = re.match(r'^(https?://)?(|\d{4})/\w+.\S+)', url.strip(". \t\r\n"))
#Checks if match exists
if matched_url is not None:
return matched_url[4] in Fetcher.SUPPORTED_JOURNALS
return False
def get_pub_light(url: str) -> Publication:
Fetches the HTML and creates a BeautifulSoup instance via the parent class.
Specific css-searches for ACS-Journals and creates Publication-instance.
# Creation of Soup
soup = JournalFetcher.get_soup(url)
except Exception as error:
raise error
# Raise Error if re recognizes Pattern, but url isnt correct:
# For other Urls
if soup.text.strip(" \t\n")=="Missing resource null":
raise ValueError("'{}' matches Pattern for 'ACS', but doesnt link to Paper.".format(url))
# For Dois
if soup.title is not None:
if soup.title.text == "Error: DOI Not Found":
raise ValueError("'{}' matches Pattern for 'ACS', but doesnt link to Paper.".format(url))
soup_header ='.article_header')[0]
# Creates Publication
doi_url ='a[title="DOI URL"]')[0].string
title =".hlFld-Title")[0].text
contributors = []
for author in".hlFld-ContribAuthor"):
journal =".cit-title")[0].text
# Replaces abbreviation with whole name
if journal in JournalFetcher.abbrev_dict:
journal = JournalFetcher.abbrev_dict[journal]
published =".pub-date-value")[0].text
subjects = []
subject_soup ='.article_header-taxonomy')[0]
for subject in'a'):
return Publication(doi_url, title, contributors, journal, published,
def get_publication(url: str) -> Publication:
Fetches the HTML and creates a BeautifulSoup instance via the parent class.
Specific css-searches for ACS-Journals and creates Publication-instance.
# Creation of Soup
soup = JournalFetcher.get_soup(url)
except Exception as error:
raise error
# Raise Error if re recognizes Pattern, but url isnt correct:
# For other Urls
if soup.text.strip(" \t\n")=="Missing resource null":
raise ValueError("'{}' matches Pattern for 'ACS', but doesnt link to Paper.".format(url))
# For Dois
if soup.title is not None:
if soup.title.text == "Error: DOI Not Found":
raise ValueError("'{}' matches Pattern for 'ACS', but doesnt link to Paper.".format(url))
soup_header ='.article_header')[0]
#Could be used for more specific search
ref_cit_soup = soup
# Creates Publication
doi_url ='a[title="DOI URL"]')[0].string
title =".hlFld-Title")[0].text
contributors = []
for author in".hlFld-ContribAuthor"):
journal =".cit-title")[0].text
# Replaces abbreviation with whole name
if journal in JournalFetcher.abbrev_dict:
journal = JournalFetcher.abbrev_dict[journal]
published =".pub-date-value")[0].text
subjects = []
subject_soup ='.article_header-taxonomy')[0]
for subject in'a'):
references = []
references_soup ='ol#references')
if references_soup != []:
for reference in references_soup[0].select('li'):
if'.refDoi') != []:
ref_doi = "{}".format('.refDoi')[0].text.strip()[5:])
# No Doi -> No Paper
ref_title ='.NLM_article-title')[0].text\
if'.NLM_article-title') != [] else None
ref_journal ='i')[0].text\
if'i') != [] else None
# Replaces abbreviation with whole name
if ref_journal in JournalFetcher.abbrev_dict:
ref_journal = JournalFetcher.abbrev_dict[ref_journal]
for author in'.NLM_contrib-group'):
ref_contributors.append(author.text.replace("\n", " ").replace("\r", ""))
references.append(Citation(ref_doi, ref_title, ref_journal, ref_contributors, cit_type="Reference"))
citations = []
citation_soup ='.cited-content_cbyCitation')
if citation_soup != []:
for citation in citation_soup[0].select('li'):
if'a[title="DOI URL"]') != []:
cit_doi ='a[title="DOI URL"]')[0].text
# No Doi -> No Paper
cit_title ='.cited-content_cbyCitation_article-title')[0].text\
if'.cited-content_cbyCitation_article-title')!= [] else None
cit_journal ='.cited-content_cbyCitation_journal-name')[0].text\
if'.cited-content_cbyCitation_journal-name') != [] else None
# Replaces abbreviation with whole name
if cit_journal in JournalFetcher.abbrev_dict:
cit_journal = JournalFetcher.abbrev_dict[cit_journal]
cit_contributors =[]
cit_contributors ='.cited-content_cbyCitation_article-contributors')[0]\
.text.replace("\n", " ").replace("\r", "").split(', ')
# clean up of the last Entry
cit_contributors_last = cit_contributors.pop().strip(". ")
if cit_contributors_last != '':
citations.append(Citation(cit_doi, cit_title, cit_journal, cit_contributors, cit_type = "Citation"))
return Publication(doi_url, title, contributors, journal, published
, subjects, references, citations)
#!/usr/bin/env python3
Parent class for specific Journal
from abc import ABCMeta, abstractmethod
from bs4 import BeautifulSoup
import requests
from input.publication import Publication
class JournalFetcher(metaclass=ABCMeta):
This is an abstract class for fetcher modules
def get_soup(url: str) -> BeautifulSoup:
    """
    Retrieves the HTML of a website and returns a BeautifulSoup instance.

    :param url: doi-url to a publication
    :type url: str
    :return: BeautifulSoup instance parsed with 'html.parser'
    :raises SystemExit: if requests raises an HTTPError while fetching
    """
    try:
        req = requests.get(url)
    except requests.exceptions.HTTPError as err:
        # NOTE(review): no raise_for_status() is called, so an error status
        # code presumably does not end up here - confirm against callers
        raise SystemExit(err) from err

    return BeautifulSoup(req.content, 'html.parser')
def can_use_url(url: str) -> bool:
    """
    Abstract function to be implemented in the subclass.
    Checks if the given url links to a supported journal.

    :param url: url to check
    :type url: str
    :raises AttributeError: always, the parent class has no implementation
    """
    raise AttributeError("JournalFetcher for '{}' hasnt implemented 'can_use_url()'".format(url))
def get_publication(url: str) -> Publication:
    """
    Abstract function to be implemented in the subclass.
    Creates a Publication-instance.

    :param url: url to a publication
    :type url: str
    :raises AttributeError: always, the parent class has no implementation
    """
    raise AttributeError("JournalFetcher for '{}' hasnt implemented 'get_publication()'".format(url))
# A Dictionary, which connects abbreviation to whole journal-name
abbrev_dict = {
"Nat. Protoc.":"Journal of Natural Products"
,"PLoS Comput. Biol.":"PLoS Computational Biology"
,"PLoS One":"PLoS One"
,"Protein Sci.":"Protein Science"
,"J. Am. Chem. Soc.":"Journal of the American Chemical Society"
,"J. Chem. Phys.":"Journal of Chemical Physics"
,"Appl. Sci.":"Applied Science"
,"Comput. Sci. Eng.":"Computing in Science & Engineering"
,"Beilstein J. Org. Chem.":"Beilstein Journal of Organic Chemistry"
,"Biol. Chem.":"Biological Chemistry"
,"Isr. J. Chem.":"Israel Journal of Chemistry"
,"Nat. Methods":"Nature Methods"
,"Proc. Natl. Acad. Sci. U. S. A.":"Proceedings of the National Academy of Sciences of the United States of America"
,"J. Phys. Chem. B":"Journal of Physical Chemistry B"
,"Carbohydr. Res.":"Carbohydrate Research"
,"J. Chem. Theory Comput.":"Journal of Chemical Theory and Computation"
,"J. Mol. Biol.":"Journal of Molecular Biology"
,"Nucleic Acids Res.":"Nucleic Acids Research"
,"J. Comput. Chem.":"Journal of Computational Chemistry"
,"J. Cheminf.":"Journal of Cheminformatics"
,"J. Med. Chem.":"Journal of Medicinal Chemistry"
,"J. Comput.-Aided Mol. Des.":"Journal of Computer-Aided Molecular Design"
,"J. Chem. Inf. Model.":"Journal of Chemical Information and Modeling"
,"Mol. Cell":"Molecular Cell"
,"J. Cell Biolog.":"Journal of Cell Biology"
,"Mol. Cell Biol.":"Molecular and Cellular Biology"
,"J. Cell Sci.":"Journal of Cell Science"
,"Nat. Cell Biol.":"Nature Cell Biology"
,"J. Aerosol Sci. Technol.":"Aerosol Science and Technology"
,"Mol. Biol. Cell":"Molecular Biology of the Cell"
,"Build. Environ.":"Building and Environment"
,"Sci. Rep.":"Scientific Reports"
,"Nat. Chem.":"Nature Chemistry"
,"Nat. Med.":"Nature Medicine"
,"Nat. Commun.":"Nature Communications"
,"Exp. Cell Res.":"Experimental Cell Research"
,"Nat. Chem. Biol.":"Nature Chemical Biology"
\ No newline at end of file
#!/usr/bin/env python3
Child class of JournalFetcher
Usage: Check if Url can be used with 'can_use_url'
and then fetch publication with 'get_publication'
# import re
from input.get.journal_fetcher import JournalFetcher
from input.publication import Publication
class Fetcher(JournalFetcher):
scrapes publication metadata from a provided url
# TODO: List of Compatable Journals
# NOTE: nature does not use journal names in doi links, must match by 10.xxxx identifier instead
def can_use_url(url: str) -> bool:
    """
    Checks if given url links to a supported journal.

    :param url: url to check
    :type url: str
    :return: always False, the check is not implemented yet
    """
    # TODO: Check the URL for compatability
    return False
def get_publication(url: str) -> Publication:
Creates a Publication-instance.
soup = JournalFetcher.get_soup(url)
_doi_url = "" + soup.head.find(attrs={"name": "DOI"}).get("content")
_title = soup.head.find(attrs={"name": "citation_title"}).get("content")
_journal = soup.head.find(attrs={"name": "citation_journal_title"}).get("content")
_published = soup.head.find(attrs={"name": "prism.publicationDate"}).get("content")
_contributors = []
_subjects = []
for creator in soup.head.findAll(attrs={"name": "dc.creator"}):
for subject in soup.head.findAll(attrs={"name": "dc.subject"}):
return Publication(_doi_url, _title, _contributors, _journal, _published, _subjects)
# TODO: Exceptions-handling
# raise ValueException("Cant Fetch: '{}'".format(error))
# return None
#!/usr/bin/env python3
Child class of JournalFetcher
Usage: None, this is just a template and should be ignored
# import re
from input.get.journal_fetcher import JournalFetcher
from input.publication import Publication
class Fetcher(JournalFetcher):
This is only a template and therefore has no functionality
# TODO: Naming-Convention:
# Class: 'Fetcher'
# file: [journal-/organisation-name]
# format = "[a-z]*.py" allowed
# TODO: List of Compatable Journals
def can_use_url(url: str) -> bool:
    """
    Checks if given url links to a supported journal.

    :param url: url to check
    :type url: str
    :return: always False, the template supports no journal
    """
    # TODO: Check the URL for compatability
    # url_re = re.match(r'(https?://)?(\d{4})/\w+.\S+)', url)
    # if url_re is not None:
    #     return url_re[4] in SUPPORTED_JOURNALS
    # else:
    return False
def get_publication(url: str) -> Publication:
Creates a Publication-instance.
# TODO: Fetch data from the HTML
# soup = JournalFetcher.get_soup(url)
# doi,title,contributors[],journal,publication_date,subjects[],references[],citations[]
# TODO: Create new Publication-instance
# return Publication(doi_url, title, contributors = [], journal
# , publication_date, subjects = [], references = [], citations = [])
return None
\ No newline at end of file
#!/usr/bin/env python3
Interface for the Input-Package; only this should be accessed from outside this package.
from os import walk
import importlib
import pathlib
import re
from input.publication import Publication
class InputInterface:
Singleton which dynamically imports and manages fetchers
instance = None
get_path = None
# '__new__' is called before '__init__' and gives us an instance
def __new__(cls, *args, **kwargs):
    """
    Returns the single shared instance and creates it on the first call.

    '__new__' is called before '__init__' and gives us an instance.
    """
    # checks if an instance exists and if it doesnt creates one;
    # 'object.__new__' takes no extra arguments, so none are forwarded
    if cls.instance is None:
        cls.instance = super(InputInterface, cls).__new__(cls)
    return cls.instance
def __init__(self):
# imports all modules
if self.fetcher_classes ==[]:
if self.fetcher_classes ==[]:
raise AttributeError("No specific Fetchers where found at: '{}'"
def get_publication(self, url: str) -> Publication:
    """
    The interface-method to get a Publication-instance
    (including its citations and references)

    :param url: url to a Publication
    :type url: str
    :return: Publication instance
    :raises ValueError: if no imported fetcher supports the url
    """
    # Checks if module supports the 'url' and
    # returns a Publication if it does.
    for fetcher_class in InputInterface.fetcher_classes:
        if fetcher_class.can_use_url(url):
            return fetcher_class.get_publication(url)

    # No Module for given url was found
    raise ValueError("'{}' is not supported".format(url))
def get_pub_light(self, url: str) -> Publication:
    """
    The interface-method to get a Publication-instance
    (only for main article)

    :param url: url to a Publication
    :type url: str
    :return: Publication instance
    :raises ValueError: if no imported fetcher supports the url
    """
    # Checks if module supports the 'url' and
    # returns a Publication if it does.
    for fetcher_class in InputInterface.fetcher_classes:
        if fetcher_class.can_use_url(url):
            return fetcher_class.get_pub_light(url)

    # No Module for given url was found
    raise ValueError("'{}' is not supported".format(url))
def get_supported_fetchers(self):
    """
    Lists the class name of every fetcher class held by the interface.

    :return: list with one '__name__' per entry of 'fetcher_classes'
    """
    # Useless right now, because all classes are called the same
    # print(self.fetcher_classes[0].__name__)
    names = []
    for fetcher_class in self.fetcher_classes:
        names.append(fetcher_class.__name__)
    return names
def import_fetcher_classes(self):
Searches in 'get', if there are [a-z]*.py modules (specific Fetchers)
and tries to import them.
Saves found modules in 'fetcher_files'.
# Path to 'get'-package
self.get_path = '{}/get'.format(pathlib.Path(__file__).parent.resolve())
# Searches for modules with given Pattern
for file in next(walk(self.get_path), (None, None, []))[2]:
if re.match(r'[a-z]', file) is not None:
# Tries to import those modules and saves their 'Fetcher'-class
for file in fetcher_file_names:
fetcher_class = importlib.import_module("input.get.{}".format(file[:-3]))
except Exception as error:
ImportError("Module '{}' does not have a 'Fetcher'-class".format(file[:-3]))
except Exception:
raise ImportError("Module '{}' can not be imported".format(file[:-3]))
#!/usr/bin/env python3
# this is needed for typing pre python 3.9, this maybe as an large Overhead
from typing import Any, List
class Publication:
Represents a Publication
def __init__(self, doi_url: str, title: str
             , contributors: List[str], journal: str
             , publication_date: str, subjects: List[str]
             , references: List[Any] = None, citations: List[Any] = None):
    """
    :param doi_url: doi_url of the publication
    :type doi_url: str
    :param title: title of the publication
    :type title: str
    :param contributors: list of all contributors
    :type contributors: List[str]
    :param journal: journal of the publication
    :type journal: str
    :param publication_date: date of release
    :type publication_date: str
    :param subjects: the subject of the Publication
    :type subjects: List[str]
    :param references: the Citation which is been referenced by this Publication
    :type references: List[Any]
    :param citations: the Citation which references this Publication
    :type citations: List[Any]
    :return: None
    """
    self.doi_url = doi_url
    self.title = title
    self.contributors = contributors
    self.journal = journal
    self.publication_date = publication_date
    self.subjects = subjects
    # 'None' stands for "not given": every instance then gets its own
    # empty list, a passed list is stored as it is
    self.references = [] if references is None else references
    self.citations = [] if citations is None else citations
# For the 'Verarbeitungsgruppe' = None
def __str__(self) -> str:
    """
    Builds a multi-line summary of the publication; references and
    citations are rendered through 'get_citation_string'.
    """
    template = ("Title: {}\n"
                "Doi-url: {}\n"
                "Authors: {}\n"
                "Journal: {}\n"
                "Published on: {}\n"
                "Subjects: {}\n"
                "References: \n{}\n"
                "Citations: \n{}")
    values = (
        self.title,
        self.doi_url,
        ", ".join(self.contributors),
        self.journal,
        self.publication_date,
        ", ".join(self.subjects),
        "\n".join(self.get_citation_string(self.references)),
        "\n".join(self.get_citation_string(self.citations)),
    )
    return template.format(*values)
def get_citation_string(citations):
if citations == []:
return ["None"]
citation_string = []
for citation in citations:
return citation_string
def add_citations(self, citation) -> None:
Appends a list of Citations or Citation to self.citations.
:param citation: Citation or Reference of the Publication
:type citation: Citation or list[Citation]
:return: self.citations
if type(citation) is Citation:
# Checks if 'citation' is a list of Citations
elif type(citation) is list:
for _cit in citation:
if type(_cit) is Citation:
raise TypeError("_set_citation expects Citations or List of Citations, not: '{}'"
raise TypeError("_set_citation expects Citations or List of Citations, not: '{}'"
return self.citations
def __eq__(self, other) -> bool:
    """ Two objects of the same type are equal if their unique doi_url matches"""
    # Objects of a different type are never equal
    if type(self) != type(other):
        return False
    return self.doi_url == other.doi_url
class Citation:
    """
    Represents a Reference or a Citation of a Publication
    """

    def __init__(self, doi_url: str, title: str
                 , journal: str, contributors: List[str]
                 , cit_type: str = "Citation"):
        """
        :param doi_url: doi_url of the publication
        :type doi_url: str
        :param title: title of the publication
        :type title: str
        :param journal: journal of the publication
        :type journal: str
        :param contributors: list of all contributors
        :type contributors: List[str]
        :param cit_type: Specifies if Reference or Citation
        :type cit_type: str
        :return: None
        """
        self.title = title
        self.doi_url = doi_url
        self.journal = journal
        self.contributors = contributors
        self.cit_type = cit_type

    def __str__(self) -> str:
        """Renders the entry as four tab-indented lines prefixed with 'cit_type'."""
        return ("\t{}-Title: {}\n"
                "\t{}-Doi: {}\n"
                "\t{}-Journal: {}\n"
                "\t{}-Contributors: {}\n")\
            .format(self.cit_type, self.title
                    , self.cit_type, self.doi_url
                    , self.cit_type, self.journal
                    , self.cit_type, ", ".join(self.contributors))
\ No newline at end of file
This diff is collapsed.
import unittest
from input.get.journal_fetcher import JournalFetcher
from input.interface import InputInterface
from input.publication import Publication
Testing the Publication fetcher
Publication 1: ''
Publication 2: ''
Publication 3: '10.1038/nchem.1781'
Publication 4: '11.12/jaj'
Publication 5: '11.12/'
Publication 6: '' # Paper is a PDF
# TODO: Testcases for:
# - Specific Journals: Inherit from FetcherTestCase
# - interface module-importer (test case)
# - Error detection
# - wrong/no Journal_fetchers
# - wrong urls
# - correct Types in publication
# - Edgecases (i.e. paper as pdf, no connection, etc)
class InterfaceTestCase(unittest.TestCase):
    """Checks that InputInterface hands out one shared instance."""

    def setUp(self):
        # No instance may exist before the interface is created here
        self.assertIsNone(InputInterface.instance)
        self.interface = InputInterface()

    def test_singleton(self):
        # interface should already be made in setUp()
        self.assertIsNotNone(self.interface.instance)
        new_interface = InputInterface()
        self.assertEqual(self.interface, new_interface)
# def test_imported_modules(self):
# fetchers = self.interface.get_supported_fetchers
class FetcherTestCase(unittest.TestCase):
def can_use_url_test(self, fetcher : JournalFetcher, test_url: str, expected_res: bool):
    """
    Asserts that 'can_use_url' of the fetcher returns the expected result

    :param fetcher: fetcher whose 'can_use_url' is called
    :param test_url: url handed to the fetcher
    :param expected_res: value the fetcher has to return
    """
    actual_res = fetcher.can_use_url(test_url)
    self.assertEqual(actual_res, expected_res)
def get_publication_test(self, fetcher : JournalFetcher, test_url: str, expected_res: Publication):
    """
    This test asserts that every variable equals the expected result

    :param fetcher: fetcher whose 'get_publication' is called
    :param test_url: url handed to the fetcher
    :param expected_res: Publication the fetched one is compared with
    """
    actual_res = fetcher.get_publication(test_url)
    self.assertEqual(actual_res.doi_url, expected_res.doi_url)
    self.assertEqual(actual_res.title, expected_res.title)
    self.assertEqual(actual_res.contributors, expected_res.contributors)
    self.assertEqual(actual_res.journal, expected_res.journal)
    self.assertEqual(actual_res.publication_date, expected_res.publication_date)
    self.assertEqual(actual_res.subjects, expected_res.subjects)

    # Checking for all references first, then for all citations
    compared_lists = (
        (actual_res.references, expected_res.references),
        (actual_res.citations, expected_res.citations),
    )
    for actual_list, expected_list in compared_lists:
        # Equal lengths are asserted first, so 'zip' drops no entry
        self.assertEqual(len(actual_list), len(expected_list))
        for actual, expected in zip(actual_list, expected_list):
            self.assertEqual(actual.doi_url, expected.doi_url)
            self.assertEqual(actual.journal, expected.journal)
            self.assertEqual(actual.contributors, expected.contributors)
            self.assertEqual(actual.cit_type, expected.cit_type)
def get_publication_exception_test(self, fetcher: JournalFetcher, test_url: str):
# Ckecks
with self.assertRaises(ValueError):
\ No newline at end of file
# Projekt CiS-Projekt 2021/22
File added
...@@ -21,9 +21,10 @@ from pathlib import Path ...@@ -21,9 +21,10 @@ from pathlib import Path
class Publication: class Publication:
#_registry = [] #_registry = []
_citations = [] _citations = []
_references = []
def __init__(self, title, publication_date, contributors, doi_url, def __init__(self, title, publication_date, contributors, doi_url,
subjects, num_citations): subjects = None, num_citations = None):
#self._registry.append(self) #self._registry.append(self)
self.title = title self.title = title
self.publication_date = publication_date self.publication_date = publication_date
...@@ -31,6 +32,8 @@ class Publication: ...@@ -31,6 +32,8 @@ class Publication:
self.doi_url = doi_url self.doi_url = doi_url
self.subjects = subjects self.subjects = subjects
self.num_citations = num_citations self.num_citations = num_citations
#self._citations = []
#self._references = []
class Citation: class Citation:
def __init__(self, title, journal, contributors, doi_url): def __init__(self, title, journal, contributors, doi_url):
...@@ -39,6 +42,13 @@ class Citation: ...@@ -39,6 +42,13 @@ class Citation:
self.contributors = contributors self.contributors = contributors
self.doi_url = doi_url self.doi_url = doi_url
class References:
def __init__(self, title, journal, contributors, doi_url):
self.title = title
self.journal = journal
self.contributors = contributors
self.doi_url = doi_url
def get_article_info(soup): def get_article_info(soup):
header = soup.find('div', class_ = 'article_header-left pull-left') header = soup.find('div', class_ = 'article_header-left pull-left')
article_title = header.find('span', class_ = 'hlFld-Title').text article_title = header.find('span', class_ = 'hlFld-Title').text
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment