Skip to content
Snippets Groups Projects
Commit 2e5dfde9 authored by Katja's avatar Katja
Browse files

Merge remote-tracking branch 'upstream/main' into main

parents 059e8e0b 5701ab3d
Branches
No related tags found
No related merge requests found
Showing
with 1058 additions and 0 deletions
**/__pycache__/
import base64
import re
import dash
from dash import dcc
from dash import html
from dash import callback_context
from dash.dependencies import Input, Output, State
from dash.exceptions import PreventUpdate
from input.interface import InputInterface
import input.publication
app = dash.Dash(__name__)

# List of options when inputting data and generating the graph
additional_options = ['Update Automatically','Smart Input']

# Reads the contents of info_box.txt once at start-up.
# They can later be displayed by pressing the corresponding button.
# A context manager guarantees the handle is closed even on error, and
# utf-8 is given explicitly because the file contains non-ASCII (German) text.
with open('info_box.txt', 'r', encoding='utf-8') as f:
    boxcontent = f.read()
# Top-level layout: the UI is organised in "layers" stacked vertically.
app.layout = html.Div([
    # Layer 0: For the Header and Help Function(s)
    html.Div([
        # Toggles the info box below (handled by the show_hide_info_box callback).
        html.Button(id='show-info',children='Show Info',n_clicks=0),
        html.Div(id='info-box')
    ]),
    # Layer 1: For all mandatory Inputs
    html.Div([
        "Input: ",
        # A simple box for inputting a string.
        # Value is transmitted upon pressing return or clicking out of the box.
        dcc.Input(id='input-string', value='', type='text',debounce=True),
        # Forward recursion. Values between 1 and 10 can be entered.
        dcc.Input(id='forward-depth',value='1',type='number',min='1',max='10'),
        # Backward recursion. Values between 1 and 10 can be entered.
        dcc.Input(id='backward-depth',value='1',type='number',min='1',max='10'),
        # Upload box. Can be used via drag-and-drop or by clicking on it
        # to open a file viewer.
        dcc.Upload(
            id="upload-data",
            children=html.Div(
                ["Drag and drop or click to select a file to upload."]),
            style={
                "width": "30%",
                "height": "60px",
                "lineHeight": "60px",
                "borderWidth": "1px",
                "borderStyle": "dashed",
                "borderRadius": "5px",
                "textAlign": "center",
                "margin": "10px",
            })
    ]),
    # Layer 2: For the checklist, Remove-/Start-Buttons and input-error-message
    html.Div([
        # All input DOIs are collected in this checklist.
        # It is initialized to avoid error messages.
        dcc.Checklist(id='input-checklist',options=[],
            labelStyle = dict(display='block'),value=[]),
        # Displays an error message if 'Smart Input' is active and a DOI fails.
        html.Div(id='input-err',style={'color':'red'}),
        # Clears the entire list.
        html.Button(id='clear-all-button',children='Clear All'),
        # Clears all selected elements.
        html.Button(id='clear-selected-button',children='Clear Selected'),
        # Starts the process that generates a graph.
        html.Button(id='start-button',children='Generate Graph')
    ]),
    # Layer 3: For additional Options (e.g. Topological Sort)
    html.Div([
        html.H4('Additional Options'),
        # A checklist of all additional options that are listed above.
        dcc.Checklist(id='additional-options',
            options=[{'label':k,'value':k} for k in additional_options],
            value=[])
    ]),
    # Layer 4: For the Graph
    html.Div([
        # Placeholder target filled by the generate_output callback.
        html.Div(id='test-output')
    ])
])
def _new_checklist_entry(input_value, additional_options):
    """
    Builds one checklist entry (dict with 'label' and 'value') for a DOI.

    With 'Smart Input' active the DOI is validated via the InputInterface and
    a readable label (first author, journal, publication date) is used; the
    lookup may raise, which the caller turns into an error message. Without
    'Smart Input' the raw input is used unvalidated as its own label.
    """
    if 'Smart Input' in additional_options:
        # Raises if the DOI cannot be resolved; handled by the caller.
        pub = InputInterface().get_pub_light(input_value)
        rep_str = pub.contributors[0] + ',' + pub.journal + \
            ',' + pub.publication_date
        return {'label': rep_str, 'value': input_value}
    return {'label': input_value, 'value': input_value}


@app.callback(
    Output('input-checklist','options'),
    Output('input-checklist','value'),
    Output('input-string','value'),
    Output('input-err','children'),
    Input('input-string','value'),
    Input('clear-all-button','n_clicks'),
    Input('clear-selected-button','n_clicks'),
    Input('upload-data','contents'),
    State('input-checklist','options'),
    State('input-checklist','value'),
    State('additional-options','value')
)
def update_input_checklist(input_value, btn1, btn2, filecontents, all_inputs,
                           selected_inputs, additional_options):
    '''
    Most important callback function. Updates the checklist that holds all inputs.

    The state of the checklist is needed so that previous entries are re-added.
    input-string is required as Output to clear the input box after each input.
    Different actions are performed depending on which Input triggered the
    callback. The value-attribute of input-checklist must be updated so that
    the values of deleted elements no longer appear in the list of selected
    elements.

    :param input_value: given by dcc.Input
    :type input_value: string
    :param btn1: signals pressing of clear-all-button
    :type btn1: int
    :param btn2: signals pressing of clear-selected-button
    :type btn2: int
    :param filecontents: the contents of an uploaded file
    :type filecontents: bit-string
    :param all_inputs: all labels and values from the checklist,
        regardless if they have been checked or not
    :type all_inputs: list of dictionaries with 2 entries each
    :param selected_inputs: values of all checked elements
    :type selected_inputs: list of strings
    :param additional_options: all checked additional options
    :type additional_options: list of strings
    '''
    # changed_id is used to determine which Input has triggered the callback
    changed_id = [p['prop_id'] for p in callback_context.triggered][0]
    # if clear-all-button was pressed:
    if 'clear-all-button' in changed_id:
        return [], [], '', ''
    # if clear-selected-button was pressed:
    if 'clear-selected-button' in changed_id:
        all_inputs = [i for i in all_inputs if i['value'] not in selected_inputs]
        return all_inputs, [], '', ''
    # when a new element is added via dcc.Input
    if 'input-string' in changed_id:
        # List of previously added values, to make sure nothing is added twice.
        curr_values = [x['value'] for x in all_inputs]
        if input_value not in curr_values:
            try:
                all_inputs.append(
                    _new_checklist_entry(input_value, additional_options))
            except Exception as err:
                # BUG FIX: previously returned the undefined name 'options'
                # here, raising a NameError instead of showing the message.
                return all_inputs, selected_inputs, '', '{}'.format(err)
        return all_inputs, selected_inputs, '', ''
    # when a txt-file is uploaded
    if 'upload-data.contents' in changed_id:
        if filecontents:
            # Skips the info portion ("data:...;base64,") that is prepended
            # when a file is uploaded, then decodes the payload.
            found = base64.b64decode(re.search(',(.+?)$', filecontents).group(1))
            text = found.decode('utf-8')
            # Creates a list of inputs by splitting the lines
            list_of_inputs = text.strip().split('\n')
            curr_values = [x['value'] for x in all_inputs]
            # For every line the same actions as for a single input are performed
            for input_value in list_of_inputs:
                if input_value not in curr_values:
                    try:
                        all_inputs.append(
                            _new_checklist_entry(input_value, additional_options))
                    except Exception as err:
                        # Entries added before the failure are kept.
                        return all_inputs, selected_inputs, '', '{}'.format(err)
            return all_inputs, selected_inputs, '', ''
    # when the program is first started:
    # if this is not done, the input_checklist would be generated
    # with one element that contains an empty string
    if input_value == '':
        return [], [], '', ''
    # No recognised trigger: keep the current state instead of
    # implicitly returning None (which Dash reports as an error).
    raise PreventUpdate
@app.callback(
    Output('info-box','children'),
    Input('show-info','n_clicks')
)
def show_hide_info_box(n_clicks):
    '''
    Toggles the info box: an odd number of clicks shows the text loaded
    from info_box.txt at start-up, an even number hides it again.

    :param n_clicks: number of times show-info has been clicked.
    :type n_clicks: int
    '''
    if n_clicks % 2 == 1:
        return html.Div(boxcontent, style={'whiteSpace': 'pre-line'})
    return ''
@app.callback(
    Output('test-output','children'),
    Input('start-button','n_clicks'),
    Input('input-checklist','options'),
    Input('input-checklist','value'),
    Input('forward-depth','value'),
    Input('backward-depth','value'),
    State('additional-options','value')
)
def generate_output(n_clicks, all_inputs, selected_inputs,
                    forward_depth, backward_depth, additional_options):
    '''
    Basic structure for a callback that generates an output. This is only a
    proof of concept and has nothing to do with the intended output yet.

    :param n_clicks: how often has Generate Graph been clicked
    :type n_clicks: int
    :param all_inputs: all labels and values from the checklist,
        regardless if they have been checked or not
    :type all_inputs: list of dictionaries with 2 entries each
    :param selected_inputs: values of all checked elements
    :type selected_inputs: list of strings
    :param forward_depth: forward recursion depth
    :type forward_depth: unsigned int
    :param backward_depth: backward recursion depth
    :type backward_depth: unsigned int
    :param additional_options: value of all selected additional options
    :type additional_options: list of strings
    '''
    changed_id = [p['prop_id'] for p in callback_context.triggered][0]
    # Never produce output before the button has existed at least once.
    if n_clicks is None:
        raise PreventUpdate
    # Only react to an explicit button press unless auto-update is on.
    if 'Update Automatically' not in additional_options \
            and 'start-button' not in changed_id:
        raise PreventUpdate
    # Proof-of-concept output: repeat each value depending on the depths.
    pieces = []
    for entry in all_inputs:
        value = entry['value']
        if value in selected_inputs:
            pieces.append(value * abs(int(forward_depth) - int(backward_depth)))
        else:
            pieces.append(value * (int(forward_depth) + int(backward_depth)))
    return ''.join(pieces)
if __name__ == '__main__':
    # Run the Dash development server (debug=True enables hot reloading).
    app.run_server(debug=True)
#!/usr/bin/env python3
from input.interface import InputInterface as Input
def count_journals(url: str):
    """
    Tallies the journal of every citation and reference of a publication.

    NOTE(review): this mutates the module-level dict ``cit`` (created in the
    __main__ block below); calling it before ``cit`` exists raises a
    NameError — TODO consider passing the tally dict explicitly.

    :param url: doi-url of the publication to analyse
    :type url: str
    """
    inter = Input()
    pub = inter.get_publication(url)
    # Citations and references are tallied in exactly the same way.
    for linked in (pub.citations or [], pub.references or []):
        for entry in linked:
            journal = entry.journal
            cit[journal] = cit.get(journal, 0) + 1
if __name__ == "__main__":
    # Global tally shared with count_journals: journal name -> count.
    cit = {}
    count_journals("https://doi.org/10.1021/acs.jcim.1c00203")
    count_journals("https://doi.org/10.1021/acs.jcim.6b00561")
    count_journals("https://doi.org/10.1021/acs.jcim.6b00613")
    count_journals("https://doi.org/10.1021/acs.jcim.1c00917")
    count_journals("https://doi.org/10.1021/acs.jmedchem.0c01332")
    #count_journals("https://pubs.acs.org/doi/10.1021/acs.biochem.1c00290")
    #count_journals("https://pubs.acs.org/doi/10.1021/acsenvironau.1c00007")
    #count_journals("https://pubs.acs.org/doi/10.1021/acs.biochem.7b01162")
    # Sort the tally ascending by occurrence count before printing.
    cit = dict(sorted(cit.items(), key=lambda item: item[1]))
    for journal in cit:
        # Entries without a journal name are skipped.
        if journal != "":
            print(f'{journal}: {cit[journal]}')
#!/usr/bin/env python3
from input.interface import InputInterface as Input
def main(url: str):
    """Fetches the publication behind *url* (light version) and prints it."""
    interface = Input()
    #print(interface.get_publication(url))
    print(interface.get_pub_light(url))
    # print(interface.get_supported_fetchers()) Useless because all classes are called the same
if __name__ == "__main__":
    # First url is commented out (truncated DOI, useful to test error handling).
    #main("https://doi.org/10.1021/acs.jcim.1c0023")
    main("https://doi.org/10.1021/acs.jcim.5b00332")
English
Show Info: Can be activated and deactivated by clicking on the button.
Input: input by entering a DOI ("Digital Object Identifier")
Drag and drop or click to select a file to upload: entering multiple DOI by txt-file is only possible if every DOI has its own line.
Recursion:
Clear All: clearing all inputs
Clear Selected: clearing all selected inputs
Generate Graph: generates the graph
Update Automatically: automatically updates the graph for every new input
Smart Input: checks the correctness of the entered DOI and shows a nicer depiction: Author, Journal, publication date.
German
Show Info: Durch wiederholtes klicken kann das Fenster ein und aus geblendet werden.
Input: Die Eingabe erfolgt in Form eines DOI ("Digital Object Identifier")
Drag and drop or click to select a file to upload: Mehrere DOI in einem txt-Dokument müssen untereinander angeordnet sein.
Recursion:
Clear All: alle Eingaben werden gelöscht
Clear Selected: alle markierten Eingaben werden gelöscht
Generate Graph: generiert den zugehörigen Graphen
Update Automatically: automatische Aktualisierung des Graphen nach neuer Eingabe
Smart Input: direkte Überprüfung der Eingabe auf Richtigkeit zudem wird nicht mehr der DOI angezeigt sondern: Der Autor, Das Journal, Das Veröffentlichungsdatum.
# Projekt CiS-Projekt 2021/22
Input-Package to fetch publication information with a given url.
## Usage/Examples
```python
from input.interface import InputInterface as Input
from input.publication import Publication
def main(url):
inter = Input()
try:
pub = inter.get_publication(url)
except Exception as error:
raise error
print(pub)
pub.title = "Cool new Title"
print(pub)
if __name__ == "__main__":
main("https://doi.org/10.1021/acs.chemrev.8b00728")
```
The expected results of calling this method are:
| Input-Url | Result |
|-----------|-----------|
| supported & correct| A publication Instance |
| supported & incorrect| ValueError|
| not supported | ValueError|
Supported URLs are urls that comply with the URL pattern of a supported journal.
### Supported Journals:
- ACS-Journals
- (Nature-Journals)
## Testing
``` c
python -m unittest input/test/<file.py> -v
# for all tests in directory
python -m unittest discover input/test -v
```
## Authors
- Florian Jochens
- Sam Ockenden
- Julius Schenk
\ No newline at end of file
File added
File added
File added
File added
#!/usr/bin/env python3
"""
Child class of JournalFetcher
Usage: Check if Url can be used with 'can_use_url'
and then fetch publication with 'get_publication'
"""
import re
from input.get.journal_fetcher import JournalFetcher
from input.publication import Publication, Citation
class Fetcher(JournalFetcher):
    """
    Specific Fetcher for the ACS journals.
    """

    # Constant for the abbreviations of the supported Journals.
    # '1021' is the DOI registrant code of ACS (doi '10.1021/...').
    SUPPORTED_JOURNALS = ['1021']

    @staticmethod
    def can_use_url(url: str) -> bool:
        """
        Uses Regex to extract journal specific substrings in Doi.

        :param url: url to a Publication
        :return: True if the url belongs to a supported ACS journal
        TODO: Support non Doi-urls
        """
        # Dots in the scheme/host/prefix are escaped so that e.g.
        # 'doiXorg/...' no longer matches.
        # NOTE(review): the trailing '\w+.\S+' is kept as-is (dot matches any
        # char) because some legacy ACS DOIs lack a literal dot there.
        matched_url = re.match(
            r'^(https?://)?(doi\.org/|pubs\.acs\.org/doi/)?(10\.(\d{4})/\w+.\S+)',
            url.strip(". \t\r\n"))
        # Checks if match exists
        if matched_url is not None:
            # group 4 is the DOI registrant code (e.g. '1021' for ACS)
            return matched_url[4] in Fetcher.SUPPORTED_JOURNALS
        return False

    @staticmethod
    def _get_checked_soup(url: str):
        """Fetches the page and raises ValueError if it is not an ACS paper."""
        soup = JournalFetcher.get_soup(url)
        # Raise Error if re recognizes Pattern, but url isnt correct:
        # For other Urls
        if soup.text.strip(" \t\n") == "Missing resource null":
            raise ValueError("'{}' matches Pattern for 'ACS', but doesnt link to Paper.".format(url))
        # For Dois
        if soup.title is not None:
            if soup.title.text == "Error: DOI Not Found":
                raise ValueError("'{}' matches Pattern for 'ACS', but doesnt link to Paper.".format(url))
        return soup

    @staticmethod
    def _parse_article_header(soup_header):
        """
        Extracts (doi_url, title, contributors, journal, published, subjects)
        from the '.article_header' element of an ACS page.
        """
        doi_url = soup_header.select('a[title="DOI URL"]')[0].string
        title = soup_header.select(".hlFld-Title")[0].text
        contributors = [author.text
                        for author in soup_header.select(".hlFld-ContribAuthor")]
        journal = soup_header.select(".cit-title")[0].text
        # Replaces abbreviation with whole name
        if journal in JournalFetcher.abbrev_dict:
            journal = JournalFetcher.abbrev_dict[journal]
        published = soup_header.select(".pub-date-value")[0].text
        subject_soup = soup_header.select('.article_header-taxonomy')[0]
        subjects = [subject.text for subject in subject_soup.select('a')]
        return doi_url, title, contributors, journal, published, subjects

    @staticmethod
    def _parse_references(ref_cit_soup):
        """Parses the reference list ('ol#references') into Citation-instances."""
        references = []
        references_soup = ref_cit_soup.select('ol#references')
        if references_soup != []:
            for reference in references_soup[0].select('li'):
                if reference.select('.refDoi') != []:
                    ref_doi = "https://doi.org/{}".format(
                        reference.select('.refDoi')[0].text.strip()[5:])
                else:
                    # No Doi -> No Paper
                    continue
                ref_title = reference.select('.NLM_article-title')[0].text \
                    if reference.select('.NLM_article-title') != [] else None
                ref_journal = reference.select('i')[0].text \
                    if reference.select('i') != [] else None
                # Replaces abbreviation with whole name
                if ref_journal in JournalFetcher.abbrev_dict:
                    ref_journal = JournalFetcher.abbrev_dict[ref_journal]
                ref_contributors = []
                for author in reference.select('.NLM_contrib-group'):
                    ref_contributors.append(
                        author.text.replace("\n", " ").replace("\r", ""))
                references.append(Citation(ref_doi, ref_title, ref_journal,
                                           ref_contributors, cit_type="Reference"))
        return references

    @staticmethod
    def _parse_citations(ref_cit_soup):
        """Parses the 'cited by' section into Citation-instances."""
        citations = []
        citation_soup = ref_cit_soup.select('.cited-content_cbyCitation')
        if citation_soup != []:
            for citation in citation_soup[0].select('li'):
                if citation.select('a[title="DOI URL"]') != []:
                    cit_doi = citation.select('a[title="DOI URL"]')[0].text
                else:
                    # No Doi -> No Paper
                    continue
                cit_title = citation.select('.cited-content_cbyCitation_article-title')[0].text \
                    if citation.select('.cited-content_cbyCitation_article-title') != [] else None
                cit_journal = citation.select('.cited-content_cbyCitation_journal-name')[0].text \
                    if citation.select('.cited-content_cbyCitation_journal-name') != [] else None
                # Replaces abbreviation with whole name
                if cit_journal in JournalFetcher.abbrev_dict:
                    cit_journal = JournalFetcher.abbrev_dict[cit_journal]
                cit_contributors = citation.select('.cited-content_cbyCitation_article-contributors')[0] \
                    .text.replace("\n", " ").replace("\r", "").split(', ')
                # clean up of the last Entry
                cit_contributors_last = cit_contributors.pop().strip(". ")
                if cit_contributors_last != '':
                    cit_contributors.append(cit_contributors_last)
                citations.append(Citation(cit_doi, cit_title, cit_journal,
                                          cit_contributors, cit_type="Citation"))
        return citations

    @staticmethod
    def get_pub_light(url: str) -> Publication:
        """
        Fetches html and creates a BeautifulSoup-instance in the parent class.
        Specific css-searches for ACS-Journals and creates a Publication-instance
        (main article only, no references/citations).
        """
        soup = Fetcher._get_checked_soup(url)
        soup_header = soup.select('.article_header')[0]
        doi_url, title, contributors, journal, published, subjects = \
            Fetcher._parse_article_header(soup_header)
        return Publication(doi_url, title, contributors, journal, published,
                           subjects)

    # BUG FIX: @staticmethod was missing, although the parent class declares
    # this method static and it takes no 'self'.
    @staticmethod
    def get_publication(url: str) -> Publication:
        """
        Fetches html and creates a BeautifulSoup-instance in the parent class.
        Specific css-searches for ACS-Journals and creates a Publication-instance
        including its references and citations.
        """
        soup = Fetcher._get_checked_soup(url)
        soup_header = soup.select('.article_header')[0]
        # Could be used for more specific search
        ref_cit_soup = soup
        doi_url, title, contributors, journal, published, subjects = \
            Fetcher._parse_article_header(soup_header)
        references = Fetcher._parse_references(ref_cit_soup)
        citations = Fetcher._parse_citations(ref_cit_soup)
        return Publication(doi_url, title, contributors, journal, published,
                           subjects, references, citations)
#!/usr/bin/env python3
"""
Parent class for specific Journal
"""
from abc import ABCMeta, abstractmethod
from bs4 import BeautifulSoup
import requests
from input.publication import Publication
class JournalFetcher(metaclass=ABCMeta):
    """
    Abstract base class for fetcher modules.

    Subclasses implement 'can_use_url' and 'get_publication' for one
    specific journal/publisher.
    """

    @staticmethod
    def get_soup(url: str) -> BeautifulSoup:
        """
        Retrieves the website-html and returns a BeautifulSoup-instance

        Parameters:
        -----------
        :type url: str
        :param url: doi-url to a publication
        :return: BeautifulSoup-instance
        """
        # NOTE(review): requests.get only raises HTTPError via
        # raise_for_status(), so this except branch is effectively dead;
        # connection errors (RequestException) propagate uncaught, and
        # SystemExit escapes 'except Exception' handlers — TODO reconsider.
        try:
            req = requests.get(url)
        except requests.exceptions.HTTPError as err:
            raise SystemExit(err)
        return BeautifulSoup(req.content, 'html.parser')

    @staticmethod
    @abstractmethod
    def can_use_url(url: str) -> bool:
        """
        Abstract-function to be implemented in subclass.
        Checks if the given url links to a supported journal.
        """
        raise AttributeError("JournalFetcher for '{}' hasnt implemented 'can_use_url()'".format(url))

    @staticmethod
    @abstractmethod
    def get_publication(url: str) -> Publication:
        """
        Abstract-function to be implemented in subclass.
        Creates a Publication-instance.
        """
        raise AttributeError("JournalFetcher for '{}' hasnt implemented 'get_publication()'".format(url))

    # A Dictionary, which connects abbreviation to whole journal-name
    abbrev_dict = {
        # BUG FIX: 'Nat. Protoc.' is Nature Protocols
        # (not the Journal of Natural Products, which is 'J. Nat. Prod.').
        "Nat. Protoc.": "Nature Protocols",
        "PLoS Comput. Biol.": "PLoS Computational Biology",
        "PLoS One": "PLoS One",
        "Protein Sci.": "Protein Science",
        "J. Am. Chem. Soc.": "Journal of the American Chemical Society",
        "J. Chem. Phys.": "Journal of Chemical Physics",
        "Appl. Sci.": "Applied Science",
        "Comput. Sci. Eng.": "Computing in Science & Engineering",
        "Beilstein J. Org. Chem.": "Beilstein Journal of Organic Chemistry",
        "Biol. Chem.": "Biological Chemistry",
        "Isr. J. Chem.": "Israel Journal of Chemistry",
        "Nat. Methods": "Nature Methods",
        "Proc. Natl. Acad. Sci. U. S. A.": "Proceedings of the National Academy of Sciences of the United States of America",
        "J. Phys. Chem. B": "Journal of Physical Chemistry B",
        "Carbohydr. Res.": "Carbohydrate Research",
        "J. Chem. Theory Comput.": "Journal of Chemical Theory and Computation",
        "J. Mol. Biol.": "Journal of Molecular Biology",
        "Nucleic Acids Res.": "Nucleic Acids Research",
        "J. Comput. Chem.": "Journal of Computational Chemistry",
        "J. Cheminf.": "Journal of Cheminformatics",
        "J. Med. Chem.": "Journal of Medicinal Chemistry",
        "J. Comput.-Aided Mol. Des.": "Journal of Computer-Aided Molecular Design",
        "J. Chem. Inf. Model.": "Journal of Chemical Information and Modeling",
        "Mol. Cell": "Molecular Cell",
        "J. Cell Biolog.": "Journal of Cell Biology",
        "Mol. Cell Biol.": "Molecular and Cellular Biology",
        "J. Cell Sci.": "Journal of Cell Science",
        "Nat. Cell Biol.": "Nature Cell Biology",
        "J. Aerosol Sci. Technol.": "Aerosol Science and Technology",
        "Mol. Biol. Cell": "Molecular Biology of the Cell",
        "Build. Environ.": "Building and Environment",
        "Sci. Rep.": "Scientific Reports",
        "Nat. Chem.": "Nature Chemistry",
        "Nat. Med.": "Nature Medicine",
        "Nat. Commun.": "Nature Communications",
        "Exp. Cell Res.": "Experimental Cell Research",
        "Nat. Chem. Biol.": "Nature Chemical Biology",
    }
\ No newline at end of file
#!/usr/bin/env python3
"""
Child class of JournalFetcher
Usage: Check if Url can be used with 'can_use_url'
and then fetch publication with 'get_publication'
"""
# import re
from input.get.journal_fetcher import JournalFetcher
from input.publication import Publication
class Fetcher(JournalFetcher):
    """
    Scrapes publication metadata from a provided url (Nature journals).
    """

    # TODO: list of compatible journals.
    # NOTE: nature does not use journal names in doi links,
    #       must match by 10.xxxx identifier instead.
    SUPPORTED_JOURNALS = []

    @staticmethod
    def can_use_url(url: str) -> bool:
        """
        Checks if the given url links to a supported journal.
        """
        # TODO: check the URL for compatibility (re.match in SUPPORTED_JOURNALS).
        return False

    @staticmethod
    def get_publication(url: str) -> Publication:
        """
        Creates a Publication-instance from the page's <head> metadata.
        """
        head = JournalFetcher.get_soup(url).head
        doi_url = "https://doi.org/" + head.find(attrs={"name": "DOI"}).get("content")
        title = head.find(attrs={"name": "citation_title"}).get("content")
        journal = head.find(attrs={"name": "citation_journal_title"}).get("content")
        published = head.find(attrs={"name": "prism.publicationDate"}).get("content")
        contributors = [tag.get("content")
                        for tag in head.findAll(attrs={"name": "dc.creator"})]
        subjects = [tag.get("content")
                    for tag in head.findAll(attrs={"name": "dc.subject"})]
        return Publication(doi_url, title, contributors, journal, published, subjects)
        # TODO: exception handling
        # raise ValueException("Cant Fetch: '{}'".format(error))
        # return None
#!/usr/bin/env python3
"""
Child class of JournalFetcher
Usage: None, this is just a template and should be ignored
"""
# import re
from input.get.journal_fetcher import JournalFetcher
from input.publication import Publication
class Fetcher(JournalFetcher):
    """
    Template fetcher; has no functionality and should be ignored.
    """

    # TODO naming convention:
    #   class: 'Fetcher'
    #   file:  [journal-/organisation-name], format "[a-z]*.py" allowed
    # TODO: list of compatible journals
    SUPPORTED_JOURNALS = []

    @staticmethod
    def can_use_url(url: str) -> bool:
        """
        Checks if given url links to a supported journal.
        """
        # TODO: check the URL for compatibility, e.g.:
        # url_re = re.match(r'(https?://)?(doi.org/)?(10.(\d{4})/\w+.\S+)', url)
        # if url_re is not None:
        #     return url_re[4] in SUPPORTED_JOURNALS
        return False

    @staticmethod
    def get_publication(url: str) -> Publication:
        """
        Creates a Publication-instance.
        """
        # TODO: fetch data from the HTML, e.g.:
        # soup = JournalFetcher.get_soup(url)
        # doi,title,contributors[],journal,publication_date,subjects[],references[],citations[]
        # TODO: create and return a new Publication-instance:
        # return Publication(doi_url, title, contributors = [], journal
        #                    , publication_date, subjects = [], references = [], citations = [])
        return None
\ No newline at end of file
#!/usr/bin/env python3
"""
Interface for the Input-Package only this should be accessed from outside this Package.
"""
from os import walk
import importlib
import pathlib
import re
from input.publication import Publication
class InputInterface:
    """
    Singleton which dynamically imports and manages fetchers.
    """
    # Shared singleton instance (None until first construction).
    instance = None
    # Absolute path of the 'get' package; set by import_fetcher_classes.
    get_path = None
    # 'Fetcher'-classes discovered in the 'get' package (class attribute,
    # shared by all instances).
    fetcher_classes = []

    # '__new__' is called before '__init__' and gives us an instance
    def __new__(cls, *args, **kwargs):
        # checks if an instance exists and if it doesnt creates one
        if cls.instance is None:
            # object.__new__ takes no extra arguments; forwarding *args was
            # unnecessary and only worked by accident.
            cls.instance = super(InputInterface, cls).__new__(cls)
        return cls.instance

    def __init__(self):
        """Imports all fetcher modules on first construction."""
        if self.fetcher_classes == []:
            self.import_fetcher_classes()
        if self.fetcher_classes == []:
            raise AttributeError("No specific Fetchers where found at: '{}'"
                                 .format(self.get_path))

    def get_publication(self, url: str) -> Publication:
        """
        The interface-method to get a Publication-instance
        (including it's citations and references)

        Parameters
        ----------
        :param url: url to a Publication
        :type url: str
        :return: Publication instance
        :raises ValueError: if no fetcher supports the url
        """
        # Checks if a module supports the 'url' and
        # returns a Publication if it does.
        for fetcher_class in InputInterface.fetcher_classes:
            if fetcher_class.can_use_url(url):
                return fetcher_class.get_publication(url)
        # No Module for given url was found
        raise ValueError("'{}' is not supported".format(url))

    def get_pub_light(self, url: str) -> Publication:
        """
        The interface-method to get a Publication-instance
        (only for the main article)

        Parameters
        ----------
        :param url: url to a Publication
        :type url: str
        :return: Publication instance
        :raises ValueError: if no fetcher supports the url
        """
        # Checks if a module supports the 'url' and
        # returns a Publication if it does.
        for fetcher_class in InputInterface.fetcher_classes:
            if fetcher_class.can_use_url(url):
                return fetcher_class.get_pub_light(url)
        # No Module for given url was found
        raise ValueError("'{}' is not supported".format(url))

    def get_supported_fetchers(self):
        """Returns the class names of all imported fetchers."""
        # Useless right now, because all classes are called 'Fetcher'.
        return [a.__name__ for a in self.fetcher_classes]

    def import_fetcher_classes(self):
        """
        Searches in 'get', if there are [a-z]*.py modules (specific Fetchers)
        and tries to import them.
        Saves the found 'Fetcher'-classes in 'fetcher_classes'.

        :raises ImportError: if a matching module cannot be imported or has
            no 'Fetcher'-class
        """
        # Path to 'get'-package
        self.get_path = '{}/get'.format(pathlib.Path(__file__).parent.resolve())
        # Searches for modules with the given pattern.
        # BUG FIX: the pattern is now anchored and the dot escaped —
        # r'[a-z]+.py' also matched names like 'acs.pyc' or 'abcpyx'.
        fetcher_file_names = []
        for file in next(walk(self.get_path), (None, None, []))[2]:
            if re.match(r'[a-z]+\.py$', file) is not None:
                fetcher_file_names.append(file)
        # Tries to import those modules and saves their 'Fetcher'-class
        for file in fetcher_file_names:
            try:
                fetcher_module = importlib.import_module("input.get.{}".format(file[:-3]))
            except Exception:
                raise ImportError("Module '{}' can not be imported".format(file[:-3]))
            try:
                self.fetcher_classes.append(fetcher_module.__getattribute__('Fetcher'))
            except Exception:
                # BUG FIX: the ImportError was previously constructed but
                # never raised, silently skipping broken modules.
                raise ImportError("Module '{}' does not have a 'Fetcher'-class".format(file[:-3]))
#!/usr/bin/env python3
# this is needed for typing pre python 3.9, this maybe as an large Overhead
from typing import Any, List
class Publication:
    """
    Represents a publication together with its linked references and citations.
    """

    def __init__(self, doi_url: str, title: str,
                 contributors: List[str], journal: str,
                 publication_date: str, subjects: List[str],
                 references: List[Any] = None, citations: List[Any] = None):
        """
        Parameters
        ----------
        :param doi_url: doi_url of the publication
        :type doi_url: str
        :param title: title of the publication
        :type title: str
        :param contributors: list of all contributors
        :type contributors: List[str]
        :param journal: journal the publication appeared in
        :type journal: str
        :param publication_date: date of release
        :type publication_date: str
        :param subjects: the subjects of the Publication
        :type subjects: List[str]
        :param references: the Citations which are referenced by this Publication
        :type references: List[Any]
        :param citations: the Citations which reference this Publication
        :type citations: List[Any]
        :return: None
        """
        self.doi_url = doi_url
        self.title = title
        self.contributors = contributors
        self.journal = journal
        self.publication_date = publication_date
        self.subjects = subjects
        # Mutable defaults are created per instance (never in the signature).
        self.references = [] if references is None else references
        self.citations = [] if citations is None else citations
        # For the 'Verarbeitungsgruppe' (processing group)
        self.group = None

    def __str__(self) -> str:
        return ("Title: {}\n"
                "Doi-url: {}\n"
                "Authors: {}\n"
                "Journal: {}\n"
                "Published on: {}\n"
                "Subjects: {}\n"
                "References: \n{}\n"
                "Citations: \n{}")\
            .format(self.title, self.doi_url, ", ".join(self.contributors)
                    , self.journal, self.publication_date
                    , ", ".join(self.subjects)
                    , "\n".join(self.get_citation_string(self.references))
                    , "\n".join(self.get_citation_string(self.citations)))

    @staticmethod
    def get_citation_string(citations):
        """Returns printable strings for the citations, or ["None"] if empty."""
        if citations == []:
            return ["None"]
        return [str(citation) for citation in citations]

    def add_citations(self, citation) -> List[Any]:
        """
        Appends a Citation or a list of Citations to self.citations.

        Parameter
        ---------
        :param citation: Citation or Reference of the Publication
        :type citation: Citation or list[Citation]
        :raises TypeError: if given anything but Citation(s)
        :return: self.citations
        """
        # isinstance (rather than exact type comparison) also accepts
        # subclasses of Citation.
        if isinstance(citation, Citation):
            self.citations.append(citation)
        elif isinstance(citation, list):
            for cit in citation:
                if not isinstance(cit, Citation):
                    # BUG FIX: the message referred to a nonexistent
                    # '_set_citation' method.
                    raise TypeError("add_citations expects Citations or List of Citations, not: '{}'"
                                    .format(type(cit)))
                self.citations.append(cit)
        else:
            raise TypeError("add_citations expects Citations or List of Citations, not: '{}'"
                            .format(type(citation)))
        return self.citations

    def __eq__(self, other) -> bool:
        """Compares the unique doi_url of two Publications."""
        if type(self) == type(other):
            return self.doi_url == other.doi_url
        return False
class Citation:
    """Lightweight record for one reference or citation of a Publication."""

    def __init__(self, doi_url: str, title: str,
                 journal: str, contributors: List[str],
                 cit_type: str = "Citation"):
        """
        Parameters
        ----------
        :param doi_url: doi_url of the publication
        :type doi_url: str
        :param title: title of the publication
        :type title: str
        :param journal: journal the publication appeared in
        :type journal: str
        :param contributors: list of all contributors
        :type contributors: List[str]
        :param cit_type: specifies if this is a Reference or a Citation
        :type cit_type: str
        :return: None
        """
        self.title = title
        self.doi_url = doi_url
        self.journal = journal
        self.contributors = contributors
        self.cit_type = cit_type

    def __str__(self) -> str:
        # Each line is prefixed with the citation type (e.g. "Citation-Title").
        kind = self.cit_type
        return (
            "\t{}-Title: {}\n".format(kind, self.title)
            + "\t{}-Doi: {}\n".format(kind, self.doi_url)
            + "\t{}-Journal: {}\n".format(kind, self.journal)
            + "\t{}-Contributors: {}\n".format(kind, ", ".join(self.contributors))
        )
beautifulsoup4
requests
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment