#!/usr/bin/env python3

"""
Counts how often each journal appears among the citations and
references of a fixed set of publications and prints the tally.
"""

from input.interface import InputInterface as Input

# Global tally: journal name -> number of occurrences.
# Bugfix: this dict used to be created only inside the '__main__' block,
# so importing this module and calling 'count_journals()' from elsewhere
# failed with a NameError.
cit = {}


def _tally_journals(entries) -> None:
    """Adds the journal of every citation/reference in *entries* to 'cit'."""
    for entry in entries:
        cit[entry.journal] = cit.get(entry.journal, 0) + 1


def count_journals(url: str):
    """
    Fetches the publication behind *url* and counts the journals of all
    of its citations and references into the global 'cit' dictionary.

    :param url: doi-url of a publication
    :type url: str
    """
    pub = Input().get_publication(url)

    if pub.citations:
        _tally_journals(pub.citations)

    if pub.references:
        _tally_journals(pub.references)


if __name__ == "__main__":
    count_journals("https://doi.org/10.1021/acs.jcim.1c00203")
    count_journals("https://doi.org/10.1021/acs.jcim.6b00561")
    count_journals("https://doi.org/10.1021/acs.jcim.6b00613")
    count_journals("https://doi.org/10.1021/acs.jcim.1c00917")
    count_journals("https://doi.org/10.1021/acs.jmedchem.0c01332")
    #count_journals("https://pubs.acs.org/doi/10.1021/acs.biochem.1c00290")
    #count_journals("https://pubs.acs.org/doi/10.1021/acsenvironau.1c00007")
    #count_journals("https://pubs.acs.org/doi/10.1021/acs.biochem.7b01162")

    # Print the tally sorted by ascending count, skipping entries
    # that have no journal name.
    cit = dict(sorted(cit.items(), key=lambda item: item[1]))
    for journal, count in cit.items():
        if journal != "":
            print(f'{journal}: {count}')
main("https://doi.org/10.1021/acs.jcim.5b00332") diff --git a/input/README.md b/input/README.md new file mode 100644 index 0000000000000000000000000000000000000000..110ce69136a8935b83d070113130222f243e924f --- /dev/null +++ b/input/README.md @@ -0,0 +1,50 @@ +# Projekt CiS-Projekt 2021/22 + +Input-Package to fetch publication information with a given url. + +## Usage/Examples + +```python +from input.interface import InputInterface as Input +from input.publication import Publication + +def main(url): + inter = Input() + try: + pub = inter.get_publication(url) + except Exception as error: + raise error + + print(pub) + pub.title = "Cool new Title" + print(pub) + +if __name__ == "__main__": + main("https://doi.org/10.1021/acs.chemrev.8b00728") +``` + +The expected results of calling this methode are: +| Input-Url | Result | +|-----------|-----------| +| supported & correct| A publication Instance | +| supported & uncorrect| ValueError| +| not supported | ValueError| + +Supported Url are urls, which comply with the url-pattern of supported Journals. 
#!/usr/bin/env python3

"""
Child class of JournalFetcher
Usage: Check if Url can be used with 'can_use_url'
    and then fetch publication with 'get_publication'
"""

import re

from input.get.journal_fetcher import JournalFetcher
from input.publication import Publication, Citation


class Fetcher(JournalFetcher):
    """
    Specific Fetcher for the ACS journals.
    """

    # DOI registrant codes of the supported publishers ('1021' is ACS).
    SUPPORTED_JOURNALS = ['1021']

    @staticmethod
    def can_use_url(url: str) -> bool:
        """
        Uses Regex to extract journal specific substrings in the Doi.
        Returns True when the registrant code is a supported one.
        Bugfix: the return annotation said 'str' although a bool is returned.
        TODO: Support non Doi-urls
        """
        # Dots in the hosts and in '10.' are escaped so that strings like
        # 'doiXorg' or '10X1021' are no longer accepted.
        # NOTE(review): the '\w+.\S+' suffix is kept as-is on purpose —
        # tightening it would reject dot-less DOIs such as 10.1021/ja00315a051.
        matched_url = re.match(
            r'^(https?://)?(doi\.org/|pubs\.acs\.org/doi/)?(10\.(\d{4})/\w+.\S+)',
            url.strip(". \t\r\n"))

        # group 4 is the DOI registrant code
        if matched_url is not None:
            return matched_url[4] in Fetcher.SUPPORTED_JOURNALS
        return False

    @staticmethod
    def _check_paper_soup(url: str):
        """
        Fetches the html for *url* (via the parent class) and returns the
        BeautifulSoup-instance.

        :raises ValueError: when *url* matches the ACS pattern but does not
            link to an actual paper.
        """
        soup = JournalFetcher.get_soup(url)

        # Raise Error if re recognizes Pattern, but url isnt correct:
        # For other Urls (ACS answers with a stub page)
        if soup.text.strip(" \t\n") == "Missing resource null":
            raise ValueError("'{}' matches Pattern for 'ACS', but doesnt link to Paper.".format(url))

        # For Dois
        if soup.title is not None and soup.title.text == "Error: DOI Not Found":
            raise ValueError("'{}' matches Pattern for 'ACS', but doesnt link to Paper.".format(url))

        return soup

    @staticmethod
    def _parse_article_header(soup):
        """
        Extracts (doi_url, title, contributors, journal, published, subjects)
        from the '.article_header' element of an ACS paper page.
        """
        soup_header = soup.select('.article_header')[0]

        doi_url = soup_header.select('a[title="DOI URL"]')[0].string
        title = soup_header.select(".hlFld-Title")[0].text

        contributors = [author.text
                        for author in soup_header.select(".hlFld-ContribAuthor")]

        journal = soup_header.select(".cit-title")[0].text
        # Replaces abbreviation with whole name
        journal = JournalFetcher.abbrev_dict.get(journal, journal)

        published = soup_header.select(".pub-date-value")[0].text

        subject_soup = soup_header.select('.article_header-taxonomy')[0]
        subjects = [subject.text for subject in subject_soup.select('a')]

        return doi_url, title, contributors, journal, published, subjects

    @staticmethod
    def _parse_references(soup):
        """
        Extracts the reference list ('ol#references') as Citation-instances.
        Entries without a Doi are skipped (No Doi -> No Paper).
        """
        references = []
        references_soup = soup.select('ol#references')
        if references_soup != []:
            for reference in references_soup[0].select('li'):
                if reference.select('.refDoi') == []:
                    # No Doi -> No Paper
                    continue
                # Strips the leading 'DOI: ' (5 characters) off the text.
                ref_doi = "https://doi.org/{}".format(
                    reference.select('.refDoi')[0].text.strip()[5:])

                ref_title = reference.select('.NLM_article-title')[0].text \
                    if reference.select('.NLM_article-title') != [] else None
                ref_journal = reference.select('i')[0].text \
                    if reference.select('i') != [] else None

                # Replaces abbreviation with whole name
                ref_journal = JournalFetcher.abbrev_dict.get(ref_journal, ref_journal)

                ref_contributors = [author.text.replace("\n", " ").replace("\r", "")
                                    for author in reference.select('.NLM_contrib-group')]

                references.append(Citation(ref_doi, ref_title, ref_journal,
                                           ref_contributors, cit_type="Reference"))
        return references

    @staticmethod
    def _parse_citations(soup):
        """
        Extracts the 'cited by' list as Citation-instances.
        Entries without a Doi are skipped (No Doi -> No Paper).
        """
        citations = []
        citation_soup = soup.select('.cited-content_cbyCitation')
        if citation_soup != []:
            for citation in citation_soup[0].select('li'):
                if citation.select('a[title="DOI URL"]') == []:
                    # No Doi -> No Paper
                    continue
                cit_doi = citation.select('a[title="DOI URL"]')[0].text

                cit_title = citation.select('.cited-content_cbyCitation_article-title')[0].text \
                    if citation.select('.cited-content_cbyCitation_article-title') != [] else None
                cit_journal = citation.select('.cited-content_cbyCitation_journal-name')[0].text \
                    if citation.select('.cited-content_cbyCitation_journal-name') != [] else None

                # Replaces abbreviation with whole name
                cit_journal = JournalFetcher.abbrev_dict.get(cit_journal, cit_journal)

                cit_contributors = citation.select('.cited-content_cbyCitation_article-contributors')[0] \
                    .text.replace("\n", " ").replace("\r", "").split(', ')
                # clean up of the last Entry
                cit_contributors_last = cit_contributors.pop().strip(". ")
                if cit_contributors_last != '':
                    cit_contributors.append(cit_contributors_last)

                citations.append(Citation(cit_doi, cit_title, cit_journal,
                                          cit_contributors, cit_type="Citation"))
        return citations

    @staticmethod
    def get_pub_light(url: str) -> Publication:
        """
        Fetches html and creates a Publication-instance for the main
        article only (no references/citations).
        """
        soup = Fetcher._check_paper_soup(url)
        doi_url, title, contributors, journal, published, subjects = \
            Fetcher._parse_article_header(soup)

        return Publication(doi_url, title, contributors, journal, published,
                           subjects)

    @staticmethod
    def get_publication(url: str) -> Publication:
        """
        Fetches html and creates a full Publication-instance including
        references and citations.
        Bugfix: '@staticmethod' was missing here (unlike the sibling
        methods), so an instance call would have passed the instance as 'url'.
        """
        soup = Fetcher._check_paper_soup(url)
        doi_url, title, contributors, journal, published, subjects = \
            Fetcher._parse_article_header(soup)

        return Publication(doi_url, title, contributors, journal, published,
                           subjects, Fetcher._parse_references(soup),
                           Fetcher._parse_citations(soup))
") + if cit_contributors_last != '': + cit_contributors.append(cit_contributors_last) + citations.append(Citation(cit_doi, cit_title, cit_journal, cit_contributors, cit_type = "Citation")) + + return Publication(doi_url, title, contributors, journal, published + , subjects, references, citations) diff --git a/input/get/journal_fetcher.py b/input/get/journal_fetcher.py new file mode 100755 index 0000000000000000000000000000000000000000..514af1f80f5c7d442b790aebf5fe3954d50f8f5d --- /dev/null +++ b/input/get/journal_fetcher.py @@ -0,0 +1,96 @@ +#!/usr/bin/env python3 + +""" +Parent class for specific Journal +""" + +from abc import ABCMeta, abstractmethod +from bs4 import BeautifulSoup +import requests +from input.publication import Publication + + +class JournalFetcher(metaclass=ABCMeta): + """ + This is a abstract-class for fetcher modules + """ + + @staticmethod + def get_soup(url: str) -> BeautifulSoup: + """ + Retrieves webside-html and returns a BeautifulSoup-instance + + Parameters: + ----------- + :type url: str + :param url: doi-url to a publication + :return: BeatifulSoup-instance + """ + try: + req = requests.get(url) + except requests.exceptions.HTTPError as err: + raise SystemExit(err) + + return BeautifulSoup(req.content, 'html.parser') + + + @staticmethod + @abstractmethod + def can_use_url(url: str) -> bool: + """ + Abstract-function to be implemented in subclass. + Checks if given url links to a supported journal + """ + raise AttributeError("JournalFetcher for '{}' hasnt implemented 'can_use_url()'".format(url)) + + + @staticmethod + @abstractmethod + def get_publication(url: str) -> Publication: + """ + Abstract-function to be implemented in subclass. + Creates a Publication-instance. + """ + raise AttributeError("JournalFetcher for '{}' hasnt implemented 'get_publication()'".format(url)) + + + # A Dictionary, which connects abbreviation to whole journal-name + abbrev_dict = { + "Nat. Protoc.":"Journal of Natural Products" + ,"PLoS Comput. 
Biol.":"PLoS Computational Biology" + ,"PLoS One":"PLoS One" + ,"Protein Sci.":"Protein Science" + ,"J. Am. Chem. Soc.":"Journal of the American Chemical Society" + ,"J. Chem. Phys.":"Journal of Chemical Physics" + ,"Appl. Sci.":"Applied Science" + ,"Comput. Sci. Eng.":"Computing in Science & Engineering" + ,"Beilstein J. Org. Chem.":"Beilstein Journal of Organic Chemistry" + ,"Biol. Chem.":"Biological Chemistry" + ,"Isr. J. Chem.":"Israel Journal of Chemistry" + ,"Nat. Methods":"Nature Methods" + ,"Proc. Natl. Acad. Sci. U. S. A.":"Proceedings of the National Academy of Sciences of the United States of America" + ,"J. Phys. Chem. B":"Journal of Physical Chemistry B" + ,"Carbohydr. Res.":"Carbohydrate Research" + ,"J. Chem. Theory Comput.":"Journal of Chemical Theory and Computation" + ,"J. Mol. Biol.":"Journal of Molecular Biology" + ,"Nucleic Acids Res.":"Nucleic Acids Research" + ,"J. Comput. Chem.":"Journal of Computational Chemistry" + ,"J. Cheminf.":"Journal of Cheminformatics" + ,"J. Med. Chem.":"Journal of Medicinal Chemistry" + ,"J. Comput.-Aided Mol. Des.":"Journal of Computer-Aided Molecular Design" + ,"J. Chem. Inf. Model.":"Journal of Chemical Information and Modeling" + ,"Mol. Cell":"Molecular Cell" + ,"J. Cell Biolog.":"Journal of Cell Biology" + ,"Mol. Cell Biol.":"Molecular and Cellular Biology" + ,"J. Cell Sci.":"Journal of Cell Science" + ,"Nat. Cell Biol.":"Nature Cell Biology" + ,"J. Aerosol Sci. Technol.":"Aerosol Science and Technology" + ,"Mol. Biol. Cell":"Molecular Biology of the Cell" + ,"Build. Environ.":"Building and Environment" + ,"Sci. Rep.":"Scientific Reports" + ,"Nat. Chem.":"Nature Chemistry" + ,"Nat. Med.":"Nature Medicine" + ,"Nat. Commun.":"Nature Communications" + ,"Exp. Cell Res.":"Experimental Cell Research" + ,"Nat. Chem. 
Biol.":"Nature Chemical Biology" + } \ No newline at end of file diff --git a/input/get/nature.py b/input/get/nature.py new file mode 100644 index 0000000000000000000000000000000000000000..c50ea0ef9d1d4a9a386730e31cc72372cbf698c0 --- /dev/null +++ b/input/get/nature.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python3 + +""" +Child class of JournalFetcher +Usage: Check if Url can be used with 'can_use_url' + and then fetch publication with 'get_publication' +""" + +# import re +from input.get.journal_fetcher import JournalFetcher +from input.publication import Publication + + +class Fetcher(JournalFetcher): + + """ + scrapes publication metadata from a provided url + """ + + # TODO: List of Compatable Journals + # NOTE: nature does not use journal names in doi links, must match by 10.xxxx identifier instead + SUPPORTED_JOURNALS = [] + + @staticmethod + def can_use_url(url: str) -> bool: + """ + Checks if given url links to a supported journal. + """ + + # TODO: Check the URL for compatability + # re.match in SUPPORTED_JOURNALS + return False + + @staticmethod + def get_publication(url: str) -> Publication: + """ + Creates a Publication-instance. 
+ """ + + soup = JournalFetcher.get_soup(url) + + _doi_url = "https://doi.org/" + soup.head.find(attrs={"name": "DOI"}).get("content") + _title = soup.head.find(attrs={"name": "citation_title"}).get("content") + _journal = soup.head.find(attrs={"name": "citation_journal_title"}).get("content") + _published = soup.head.find(attrs={"name": "prism.publicationDate"}).get("content") + _contributors = [] + _subjects = [] + + for creator in soup.head.findAll(attrs={"name": "dc.creator"}): + _contributors.append(creator.get("content")) + + for subject in soup.head.findAll(attrs={"name": "dc.subject"}): + _subjects.append(subject.get("content")) + + return Publication(_doi_url, _title, _contributors, _journal, _published, _subjects) + + # TODO: Exceptions-handling + # raise ValueException("Cant Fetch: '{}'".format(error)) + # return None diff --git a/input/get/template_.py b/input/get/template_.py new file mode 100755 index 0000000000000000000000000000000000000000..58de0237bd514f7dd1b5b25f251b740d33e3589e --- /dev/null +++ b/input/get/template_.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python3 + +""" +Child class of JournalFetcher +Usage: None, this is just a template and should be ignored +""" + +# import re +from input.get.journal_fetcher import JournalFetcher +from input.publication import Publication + + +class Fetcher(JournalFetcher): + + """ + This is only a template and therefore has no functionality + """ + + # TODO: Naming-Convention: + # Class: 'Fetcher' + # file: [journal-/organisation-name] + # format = "[a-z]*.py" allowed + # TODO: List of Compatable Journals + SUPPORTED_JOURNALS = [] + + @staticmethod + def can_use_url(url: str) -> bool: + """ + Checks if given url links to a supported journal. 
+ """ + + # TODO: Check the URL for compatability + # url_re = re.match(r'(https?://)?(doi.org/)?(10.(\d{4})/\w+.\S+)', url) + # if url_re is not None: + # return url_re[4] in SUPPORTED_JOURNALS + # else: + return False + + @staticmethod + def get_publication(url: str) -> Publication: + """ + Creates a Publication-instance. + """ + + # TODO: Fetch data from the HTML + # soup = JournalFetcher.get_soup(url) + # doi,title,contributors[],journal,publication_date,subjects[],references[],citations[] + # TODO: Create new Publication-instance + # return Publication(doi_url, title, contributors = [], journal + # , publication_date, subjects = [], references = [], citations = []) + return None \ No newline at end of file diff --git a/input/interface.py b/input/interface.py new file mode 100755 index 0000000000000000000000000000000000000000..59515b3a3a2a5361222b8e55d3a7314ab3907132 --- /dev/null +++ b/input/interface.py @@ -0,0 +1,113 @@ +#!/usr/bin/env python3 + +""" +Interface for the Input-Package only this should be accessed from outside this Package. 
+ +""" +from os import walk +import importlib +import pathlib +import re +from input.publication import Publication + +class InputInterface: + """ + Singleton which dynamically imports and manages fetchers + """ + + instance = None + get_path = None + fetcher_classes=[] + + # '__new__' is called before '__init__' and gives us an instance + def __new__(cls, *args, **kwargs): + + # checks if an instance exists and if it doesnt creates one + if cls.instance == None: + cls.instance = super(InputInterface, cls).__new__(cls,*args, **kwargs) + + return cls.instance + + def __init__(self): + # imports all modules + + if self.fetcher_classes ==[]: + self.import_fetcher_classes() + if self.fetcher_classes ==[]: + raise AttributeError("No specific Fetchers where found at: '{}'" + .format(self.get_path)) + + + def get_publication(self, url: str) -> Publication: + """ + The interface-method to get a Publication-instance + (including it's citations and references) + + Parameters + ---------- + :param url: url to a Publication + :type url: str + :return: Publication instance or None if not supported + """ + + # Checks if module supports the 'url' and + # returns a Publication if it does. + for fetcher_class in InputInterface.fetcher_classes: + if fetcher_class.can_use_url(url): + return fetcher_class.get_publication(url) + + # No Module for given url was found + raise ValueError("'{}' is not supported".format(url)) + + def get_pub_light(self, url: str) -> Publication: + """ + The interface-method to get a Publication-instance + (only for main article) + + Parameters + ---------- + :param url: url to a Publication + :type url: str + :return: Publication instance or None if not supported + """ + + # Checks if module supports the 'url' and + # returns a Publication if it does. 
+ for fetcher_class in InputInterface.fetcher_classes: + if fetcher_class.can_use_url(url): + return fetcher_class.get_pub_light(url) + + # No Module for given url was found + raise ValueError("'{}' is not supported".format(url)) + + def get_supported_fetchers(self): + # print(self.fetcher_classes[0].__name__) Useless right now, + # because all classes are called the same + return [a.__name__ for a in self.fetcher_classes] + + def import_fetcher_classes(self): + """ + Searches in 'get', if there are [a-z]*.py modules (specific Fetchers) + and tries to import them. + Saves found modules in 'fetcher_files'. + """ + + # Path to 'get'-package + self.get_path = '{}/get'.format(pathlib.Path(__file__).parent.resolve()) + + # Searches for modules with given Pattern + fetcher_file_names=[] + for file in next(walk(self.get_path), (None, None, []))[2]: + if re.match(r'[a-z]+.py', file) is not None: + fetcher_file_names.append(file) + + # Tries to import those modules and saves their 'Fetcher'-class + for file in fetcher_file_names: + try: + fetcher_class = importlib.import_module("input.get.{}".format(file[:-3])) + try: + self.fetcher_classes.append(fetcher_class.__getattribute__('Fetcher')) + except Exception as error: + ImportError("Module '{}' does not have a 'Fetcher'-class".format(file[:-3])) + except Exception: + raise ImportError("Module '{}' can not be imported".format(file[:-3])) diff --git a/input/publication.py b/input/publication.py new file mode 100755 index 0000000000000000000000000000000000000000..fc512e7173a84695ea566706784c565a7b5ebb8f --- /dev/null +++ b/input/publication.py @@ -0,0 +1,143 @@ +#!/usr/bin/env python3 + +# this is needed for typing pre python 3.9, this maybe as an large Overhead +from typing import Any, List + + +class Publication: + """ + Represents a Publications + """ + def __init__(self, doi_url: str, title: str \ + , contributors: List[str], journal: str \ + , publication_date: str, subjects: List[str]\ + , references: List[Any] = None, 
#!/usr/bin/env python3

# this is needed for typing pre python 3.9, this maybe as an large Overhead
from typing import Any, List


class Publication:
    """
    Represents a Publication and the citations/references around it.
    """
    def __init__(self, doi_url: str, title: str,
                 contributors: List[str], journal: str,
                 publication_date: str, subjects: List[str],
                 references: List[Any] = None, citations: List[Any] = None):
        """
        Parameters
        ----------
        :param doi_url: doi_url of the publication
        :type doi_url: str
        :param title: title of the publication
        :type title: str
        :param contributors: list of all contributors
        :type contributors: List[str]
        :param journal: name of the journal the publication appeared in
        :type journal: str
        :param publication_date: date of release
        :type publication_date: str
        :param subjects: the subjects of the Publication
        :type subjects: List[str]
        :param references: the Citations which are referenced by this Publication
        :type references: List[Any]
        :param citations: the Citations which reference this Publication
        :type citations: List[Any]
        :return: None
        """
        self.doi_url = doi_url
        self.title = title
        self.contributors = contributors
        self.journal = journal
        self.publication_date = publication_date
        self.subjects = subjects
        # 'None' defaults avoid the shared-mutable-default pitfall.
        self.references = [] if references is None else references
        self.citations = [] if citations is None else citations

        # For the 'Verarbeitungsgruppe'; set later by the processing stage.
        self.group = None

    def __str__(self) -> str:
        return ("Title: {}\n"
                "Doi-url: {}\n"
                "Authors: {}\n"
                "Journal: {}\n"
                "Published on: {}\n"
                "Subjects: {}\n"
                "References: \n{}\n"
                "Citations: \n{}")\
            .format(self.title, self.doi_url, ", ".join(self.contributors)
                    , self.journal, self.publication_date
                    , ", ".join(self.subjects)
                    , "\n".join(self.get_citation_string(self.references))
                    , "\n".join(self.get_citation_string(self.citations)))

    @staticmethod
    def get_citation_string(citations):
        """Returns one formatted string per citation, or ["None"] for []."""
        if citations == []:
            return ["None"]
        return [str(citation) for citation in citations]

    def add_citations(self, citation) -> List[Any]:
        """
        Appends a Citation or a list of Citations to self.citations.

        Parameter
        ---------
        :param citation: Citation or Reference of the Publication
        :type citation: Citation or list[Citation]
        :raises TypeError: if 'citation' is neither a Citation nor a list of Citations
        :return: self.citations
        """
        # Bugfix: uses isinstance instead of exact type checks, so
        # Citation subclasses are accepted as well.
        if isinstance(citation, Citation):
            self.citations.append(citation)
        elif isinstance(citation, list):
            # Validate the whole list first so that a bad element does not
            # leave 'self.citations' partially updated.
            for _cit in citation:
                if not isinstance(_cit, Citation):
                    raise TypeError("_set_citation expects Citations or List of Citations, not: '{}'"
                                    .format(type(_cit)))
            self.citations.extend(citation)
        else:
            raise TypeError("_set_citation expects Citations or List of Citations, not: '{}'"
                            .format(type(citation)))

        return self.citations

    def __eq__(self, other) -> bool:
        """ Compares the unique doi_url of two Publications"""
        if type(self) == type(other):
            return self.doi_url == other.doi_url
        return False

    def __hash__(self) -> int:
        # Bugfix: defining __eq__ alone made Publications unhashable;
        # hash on the same key the equality check uses.
        return hash(self.doi_url)


class Citation:
    """
    Represents one entry of a Publication's reference or citation list.
    """
    def __init__(self, doi_url: str, title: str,
                 journal: str, contributors: List[str],
                 cit_type: str = "Citation"):
        """
        Parameters
        ----------
        :param doi_url: doi_url of the publication
        :type doi_url: str
        :param title: title of the publication
        :type title: str
        :param journal: name of the journal the publication appeared in
        :type journal: str
        :param contributors: list of all contributors
        :type contributors: List[str]
        :param cit_type: Specifies if Reference or Citation
        :type cit_type: str
        :return: None
        """
        self.title = title
        self.doi_url = doi_url
        self.journal = journal
        self.contributors = contributors
        self.cit_type = cit_type

    def __str__(self) -> str:
        return ("\t{}-Title: {}\n"
                "\t{}-Doi: {}\n"
                "\t{}-Journal: {}\n"
                "\t{}-Contributors: {}\n")\
            .format(self.cit_type, self.title
                    , self.cit_type, self.doi_url
                    , self.cit_type, self.journal
                    , self.cit_type, ", ".join(self.contributors))
at end of file diff --git a/input/test/__init__.py b/input/test/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/input/test/test_acs.py b/input/test/test_acs.py new file mode 100644 index 0000000000000000000000000000000000000000..e3dfe84a09d3599de32efbab0dd60655b5414152 --- /dev/null +++ b/input/test/test_acs.py @@ -0,0 +1,303 @@ +#!/usr/bin/env python + +from input.get.acs import Fetcher as Acs +from input.publication import Publication, Citation +from input.test.test_input import FetcherTestCase + + +class AcsTestCase(FetcherTestCase): + """ + Methods with test_* will be detected by unittest and run. + """ + + def test_acs_url(self): + # Positive Testing + self.can_use_url_test(Acs, "https://doi.org/10.1021/acs.jcim.1c00203" , True) + self.can_use_url_test(Acs, "doi.org/10.1021/acs.jcim.1c00203" , True) + self.can_use_url_test(Acs, "10.1021/acs.jcim.1c00203" , True) + self.can_use_url_test(Acs, " 10.1021/acs.jcim.1c00203" , True) + self.can_use_url_test(Acs, "10.1021/acs.jcim.1c00203 " , True) + self.can_use_url_test(Acs, "\t 10.1021/acs.jcim.1c00203 \t\n" , True) + self.can_use_url_test(Acs, "https://pubs.acs.org/doi/10.1021/acs.jcim.1c00203" , True) + + # Negative Testing + self.can_use_url_test(Acs, "" , False) + self.can_use_url_test(Acs, "https://doi.org/10.1038/219021a0" , False) + self.can_use_url_test(Acs, "https://www.nature.com/articles/219021a0" , False) + self.can_use_url_test(Acs, "https://pubs.acs.org/doi/doi.org/10.1021/acs.jcim.1c00203", False) + + + + def test_acs_publication(self): + url = "https://doi.org/10.1021/acs.jcim.1c00203" + self.get_publication_test(Acs, url, self.expectedPubs[url]) + + def test_acs_exceptions(self): + test_url= "https://doi.org/10.1021/acs.jcim.1c002" + self.get_publication_exception_test(Acs, test_url) + + # Dictionary of Expected Results, with url + expectedPubs = { + "https://doi.org/10.1021/acs.jcim.1c00203": + Publication( + 
doi_url = "https://doi.org/10.1021/acs.jcim.1c00203", + title = "AutoDock Vina 1.2.0: New Docking Methods, Expanded Force Field, and Python Bindings", + contributors = ["Jerome Eberhardt", "Diogo Santos-Martins", "Andreas F. Tillack", "Stefano Forli"], + journal="Journal of Chemical Information and Modeling", + publication_date = "July 19, 2021", + subjects = ["Algorithms","Ligands","Molecules","Receptors","Macrocycles"], + references = [ + Citation(doi_url = "https://doi.org/10.1002/jcc.21334" + , title ="AutoDock Vina: improving the speed and accuracy of docking with a new scoring function, efficient optimization, and multithreading" + , journal="Journal of Computational Chemistry" + , contributors=["Trott, O.", "Olson, A. J."] + , cit_type="Reference") + , Citation(doi_url = "https://doi.org/10.1038/nprot.2016.051" + , title ="Computational protein-ligand docking and virtual drug screening with the AutoDock suite" + , journal="Journal of Natural Products" + , contributors=["Forli, S.","Huey, R.","Pique, M. E.","Sanner, M. F.","Goodsell, D. S.","Olson, A. J."] + , cit_type="Reference") + , Citation(title = "A semiempirical free energy force field with charge-based desolvation" + , doi_url = "https://doi.org/10.1002/jcc.20634" + , journal="Journal of Computational Chemistry" + , contributors=["Huey, R.","Morris, G. M.","Olson, A. J.","Goodsell, D. S."] + , cit_type="Reference") + , Citation(title="Accelerating autodock4 with gpus and gradient-based local search" + , doi_url="https://doi.org/10.1021/acs.jctc.0c01006" + , journal="Journal of Chemical Theory and Computation" + , contributors=["Santos-Martins, D.","Solis-Vasquez, L.","Tillack, A. F.","Sanner, M. F.","Koch, A.","Forli, S."] + , cit_type="Reference") + , Citation(title="AutoDockFR: Advances in Protein-Ligand Docking with Explicitly Specified Binding Site Flexibility" + , doi_url="https://doi.org/10.1371/journal.pcbi.1004586" + , journal="PLoS Computational Biology" + , contributors=["Ravindranath, P. 
A.","Forli, S.","Goodsell, D. S.","Olson, A. J.","Sanner, M. F."] + , cit_type="Reference") + , Citation(title="Docking flexible cyclic peptides with AutoDock CrankPep" + , doi_url="https://doi.org/10.1021/acs.jctc.9b00557" + , journal="Journal of Chemical Theory and Computation" + , contributors=["Zhang, Y.","Sanner, M. F."] + , cit_type="Reference") + , Citation(title="Fast, accurate, and reliable molecular docking with QuickVina 2" + , doi_url="https://doi.org/10.1093/bioinformatics/btv082" + , journal="Bioinformatics" + , contributors=["Alhossary, A.","Handoko, S. D.","Mu, Y.","Kwoh, C.-K."] + , cit_type="Reference") + , Citation(title="Lessons learned in empirical scoring with smina from the CSAR 2011 benchmarking exercise" + , doi_url="https://doi.org/10.1021/ci300604z" + , journal="Journal of Chemical Information and Modeling" + , contributors=["Koes, D. R.","Baumgartner, M. P.","Camacho, C. J."] + , cit_type="Reference") + , Citation(title="Vina-Carb: Improving Glycosidic Angles during Carbohydrate Docking" + , doi_url="https://doi.org/10.1021/acs.jctc.5b00834" + , journal="Journal of Chemical Theory and Computation" + , contributors=["Nivedha, A. K.","Thieker, D. F.","Makeneni, S.","Hu, H.","Woods, R. J."] + , cit_type="Reference") + , Citation(title="AutoDock VinaXB: implementation of XBSF, new empirical halogen bond scoring function, into AutoDock Vina" + , doi_url="https://doi.org/10.1186/s13321-016-0139-1" + , journal="Journal of Cheminformatics" + , contributors=["Koebel, M. R.","Schmadeke, G.","Posner, R. G.","Sirimulla, S."] + , cit_type="Reference") + , Citation(title="Vinardo: A Scoring Function Based on Autodock Vina Improves Scoring, Docking, and Virtual Screening" + , doi_url="https://doi.org/10.1371/journal.pone.0155183" + , journal="PLoS One" + , contributors=["Quiroga, R.","Villarreal, M. 
A."] + , cit_type="Reference") + , Citation(title="Lennard-Jones potential and dummy atom settings to overcome the AUTODOCK limitation in treating flexible ring systems" + , doi_url="https://doi.org/10.1021/ci700036j" + , journal="Journal of Chemical Information and Modeling" + , contributors=["Forli, S.","Botta, M."] + , cit_type="Reference") + , Citation(title="AutoDock4Zn: an improved AutoDock force field for small-molecule docking to zinc metalloproteins" + , doi_url="https://doi.org/10.1021/ci500209e" + , journal="Journal of Chemical Information and Modeling" + , contributors=["Santos-Martins, D.","Forli, S.","Ramos, M. J.","Olson, A. J."] + , cit_type="Reference") + , Citation(title="A force field with discrete displaceable waters and desolvation entropy for hydrated ligand docking" + , doi_url="https://doi.org/10.1021/jm2005145" + , journal="Journal of Medicinal Chemistry" + , contributors=["Forli, S.","Olson, A. J."] + , cit_type="Reference") + , Citation(title="Directional phosphorylation and nuclear transport of the splicing factor SRSF1 is regulated by an RNA recognition motif" + , doi_url="https://doi.org/10.1016/j.jmb.2016.04.009" + , journal="Journal of Molecular Biology" + , contributors=["Serrano, P.","Aubol, B. E.","Keshwani, M. M.","Forli, S.","Ma, C.-T.","Dutta, S. K.","Geralt, M.","Wüthrich, K.","Adams, J. A."] + , cit_type="Reference") + , Citation(title="Covalent docking using autodock: Two-point attractor and flexible side chain methods" + , doi_url="https://doi.org/10.1002/pro.2733" + , journal="Protein Science" + , contributors=["Bianco, G.","Forli, S.","Goodsell, D. S.","Olson, A. J."] + , cit_type="Reference") + , Citation(title="Consensus docking: improving the reliability of docking in a virtual screening context" + , doi_url="https://doi.org/10.1021/ci300399w" + , journal="Journal of Chemical Information and Modeling" + , contributors=["Houston, D. R.","Walkinshaw, M. 
D."] + , cit_type="Reference") + , Citation(title="DockBench: an integrated informatic platform bridging the gap between the robust validation of docking protocols and virtual screening simulations" + , doi_url="https://doi.org/10.3390/molecules20069977" + , journal="Molecules" + , contributors=["Cuzzolin, A.","Sturlese, M.","Malvacio, I.","Ciancetta, A.","Moro, S."] + , cit_type="Reference") + , Citation(title="A new force field for molecular mechanical simulation of nucleic acids and proteins" + , doi_url="https://doi.org/10.1021/ja00315a051" + , journal="Journal of the American Chemical Society" + , contributors=["Weiner, S. J.","Kollman, P. A.","Case, D. A.","Singh, U. C.","Ghio, C.","Alagona, G.","Profeta, S.","Weiner, P."] + , cit_type="Reference") + , Citation(title="AutoDock Bias: improving binding mode prediction and virtual screening using known protein-ligand interactions" + , doi_url="https://doi.org/10.1093/bioinformatics/btz152" + , journal="Bioinformatics" + , contributors=["Arcon, J. P.","Modenutti, C. P.","Avendaño, D.","Lopez, E. D.","Defelipe, L. A.","Ambrosio, F. A.","Turjanski, A. G.","Forli, S.","Marti, M. A."] + , cit_type="Reference") + , Citation(title="Inhomogeneous Fluid Approach to Solvation Thermodynamics. 1. Theory" + , doi_url="https://doi.org/10.1021/jp9723574" + , journal="Journal of Physical Chemistry B" + , contributors=["Lazaridis, T."] + , cit_type="Reference") + , Citation(title="Inhomogeneous fluid approach to solvation thermodynamics. 2. Applications to simple fluids" + , doi_url="https://doi.org/10.1021/jp972358w" + , journal="Journal of Physical Chemistry B" + , contributors=["Lazaridis, T."] + , cit_type="Reference") + , Citation(title="Grid inhomogeneous solvation theory: Hydration structure and thermodynamics of the miniature receptor cucurbit[7]uril" + , doi_url="https://doi.org/10.1063/1.4733951" + , journal="Journal of Chemical Physics" + , contributors=["Nguyen, C. N.","Young, T. K.","Gilson, M. 
K."] + , cit_type="Reference") + , Citation(title="AutoDock-GIST: Incorporating Thermodynamics of Active-Site Water into Scoring Function for Accurate Protein-Ligand Docking" + , doi_url="https://doi.org/10.3390/molecules21111604" + , journal="Molecules" + , contributors=["Uehara, S.","Tanaka, S."] + , cit_type="Reference") + , Citation(title="ZINC20—A Free Ultralarge-Scale Chemical Database for Ligand Discovery" + , doi_url="https://doi.org/10.1021/acs.jcim.0c00675" + , journal="Journal of Chemical Information and Modeling" + , contributors=["Irwin, J. J.","Tang, K. G.","Young, J.","Dandarchuluun, C.","Wong, B. R.","Khurelbaatar, M.","Moroz, Y. S.","Mayfield, J.","Sayle, R. A."] + , cit_type="Reference") + , Citation(title="Structural biology-inspired discovery of novel KRAS–PDEδ inhibitors" + , doi_url="https://doi.org/10.1021/acs.jmedchem.7b01243" + , journal="Journal of Medicinal Chemistry" + , contributors=["Jiang, Y.","Zhuang, C.","Chen, L.","Lu, J.","Dong, G.","Miao, Z.","Zhang, W.","Li, J.","Sheng, C."] + , cit_type="Reference") + , Citation(title="D3R grand challenge 2015: evaluation of protein–ligand pose and affinity predictions" + , doi_url="https://doi.org/10.1007/s10822-016-9946-8" + , journal="Journal of Computer-Aided Molecular Design" + , contributors=["Gathiaka, S.","Liu, S.","Chiu, M.","Yang, H.","Stuckey, J. A.","Kang, Y. N.","Delproposto, J.","Kubish, G.","Dunbar, J. B.","Carlson, H. A.","Burley, S. K.","Walters, W. P.","Amaro, R. E.","Feher, V. A.","Gilson, M. K."] + , cit_type="Reference") + , Citation(title="D3R grand challenge 4: blind prediction of protein–ligand poses, affinity rankings, and relative binding free energies" + , doi_url="https://doi.org/10.1007/s10822-020-00289-y" + , journal="Journal of Computer-Aided Molecular Design" + , contributors=["Parks, C. D.","Gaieb, Z.","Chiu, M.","Yang, H.","Shao, C.","Walters, W. P.","Jansen, J. M.","McGaughey, G.","Lewis, R. A.","Bembenek, S. D.","Ameriks, M. K.","Mirzadegan, T.","Burley, S. 
K.","Amaro, R. E.","Gilson, M. K."] + , cit_type="Reference") + , Citation(title="D3R Grand Challenge 4: prospective pose prediction of BACE1 ligands with AutoDock-GPU" + , doi_url="https://doi.org/10.1007/s10822-019-00241-9" + , journal="Journal of Computer-Aided Molecular Design" + , contributors=["Santos-Martins, D.","Eberhardt, J.","Bianco, G.","Solis-Vasquez, L.","Ambrosio, F. A.","Koch, A.","Forli, S."] + , cit_type="Reference") + , Citation(title="Comparison of affinity ranking using AutoDock-GPU and MM-GBSA scores for BACE-1 inhibitors in the D3R Grand Challenge 4" + , doi_url="https://doi.org/10.1007/s10822-019-00240-w" + , journal="Journal of Computer-Aided Molecular Design" + , contributors=["El Khoury, L.","Santos-Martins, D.","Sasmal, S.","Eberhardt, J.","Bianco, G.","Ambrosio, F. A.","Solis-Vasquez, L.","Koch, A.","Forli, S.","Mobley, D. L."] + , cit_type="Reference") + , Citation(title="Macrocycle modeling in ICM: benchmarking and evaluation in D3R Grand Challenge 4" + , doi_url="https://doi.org/10.1007/s10822-019-00225-9" + , journal="Journal of Computer-Aided Molecular Design" + , contributors=["Lam, P. C.-H.","Abagyan, R.","Totrov, M."] + , cit_type="Reference") + , Citation(title="Directory of useful decoys, enhanced (DUD-E): better ligands and decoys for better benchmarking" + , doi_url="https://doi.org/10.1021/jm300687e" + , journal="Journal of Medicinal Chemistry" + , contributors=["Mysinger, M. M.","Carchia, M.","Irwin, J. J.","Shoichet, B. 
K."] + , cit_type="Reference") + , Citation(title="Evaluation of AutoDock and AutoDock Vina on the CASF-2013 benchmark" + , doi_url="https://doi.org/10.1021/acs.jcim.8b00312" + , journal="Journal of Chemical Information and Modeling" + , contributors=["Gaillard, T."] + , cit_type="Reference") + , Citation(title="Autodock vina adopts more accurate binding poses but autodock4 forms better binding affinity" + , doi_url="https://doi.org/10.1021/acs.jcim.9b00778" + , journal="Journal of Chemical Information and Modeling" + , contributors=["Nguyen, N. T.","Nguyen, T. H.","Pham, T. N. H.","Huy, N. T.","Bay, M. V.","Pham, M. Q.","Nam, P. C.","Vu, V. V.","Ngo, S. T."] + , cit_type="Reference") + , Citation(title="Development and validation of a genetic algorithm for flexible docking" + , doi_url="https://doi.org/10.1006/jmbi.1996.0897" + , journal="Journal of Molecular Biology" + , contributors=["Jones, G.","Willett, P.","Glen, R. C.","Leach, A. R.","Taylor, R."] + , cit_type="Reference") + , Citation(title="Glide: a new approach for rapid, accurate docking and scoring. 1. Method and assessment of docking accuracy" + , doi_url="https://doi.org/10.1021/jm0306430" + , journal="Journal of Medicinal Chemistry" + , contributors=["Friesner, R. A.","Banks, J. L.","Murphy, R. B.","Halgren, T. A.","Klicic, J. J.","Mainz, D. T.","Repasky, M. P.","Knoll, E. H.","Shelley, M.","Perry, J. K."] + , cit_type="Reference") + , Citation(title="Surflex: fully automatic flexible molecular docking using a molecular similarity-based search engine" + , doi_url="https://doi.org/10.1021/jm020406h" + , journal="Journal of Medicinal Chemistry" + , contributors=["Jain, A. 
N."] + , cit_type="Reference") + , Citation(title="A fast flexible docking method using an incremental construction algorithm" + , doi_url="https://doi.org/10.1006/jmbi.1996.0477" + , journal="Journal of Molecular Biology" + , contributors=["Rarey, M.","Kramer, B.","Lengauer, T.","Klebe, G."] + , cit_type="Reference") + , Citation(title="EDock: blind protein–ligand docking by replica-exchange monte carlo simulation" + , doi_url="https://doi.org/10.1186/s13321-020-00440-9" + , journal="Journal of Cheminformatics" + , contributors=["Zhang, W.","Bell, E. W.","Yin, M.","Zhang, Y."] + , cit_type="Reference") + , Citation(title="DOCK 6: Impact of new features and current docking performance" + , doi_url="https://doi.org/10.1002/jcc.23905" + , journal="Journal of Computational Chemistry" + , contributors=["Allen, W. J.","Balius, T. E.","Mukherjee, S.","Brozell, S. R.","Moustakas, D. T.","Lang, P. T.","Case, D. A.","Kuntz, I. D.","Rizzo, R. C."] + , cit_type="Reference") + , Citation(title="Improving scoring-docking-screening powers of protein–ligand scoring functions using random forest" + , doi_url="https://doi.org/10.1002/jcc.24667" + , journal="Journal of Computational Chemistry" + , contributors=["Wang, C.","Zhang, Y."] + , cit_type="Reference") + , Citation(title="ID-Score: a new empirical scoring function based on a comprehensive set of descriptors related to protein–ligand interactions" + , doi_url="https://doi.org/10.1021/ci300493w" + , journal="Journal of Chemical Information and Modeling" + , contributors=["Li, G.-B.","Yang, L.-L.","Wang, W.-J.","Li, L.-L.","Yang, S.-Y."] + , cit_type="Reference") + , Citation(title="Further development and validation of empirical scoring functions for structure-based binding affinity prediction" + , doi_url="https://doi.org/10.1023/a:1016357811882" + , journal="Journal of Computer-Aided Molecular Design" + , contributors=["Wang, R.","Lai, L.","Wang, S."] + , cit_type="Reference") + , Citation(title="A knowledge-based energy 
function for protein- ligand, protein- protein, and protein- DNA complexes" + , doi_url="https://doi.org/10.1021/jm049314d" + , journal="Journal of Medicinal Chemistry" + , contributors=["Zhang, C.","Liu, S.","Zhu, Q.","Zhou, Y."] + , cit_type="Reference") + , Citation(title="DLIGAND2: an improved knowledge-based energy function for protein–ligand interactions using the distance-scaled, finite, ideal-gas reference state" + , doi_url="https://doi.org/10.1186/s13321-019-0373-4" + , journal="Journal of Cheminformatics" + , contributors=["Chen, P.","Ke, Y.","Lu, Y.","Du, Y.","Li, J.","Yan, H.","Zhao, H.","Zhou, Y.","Yang, Y."] + , cit_type="Reference") + , Citation(title="Comparing AutoDock and Vina in ligand/decoy discrimination for virtual screening" + , doi_url="https://doi.org/10.3390/app9214538" + , journal="Applied Science" + , contributors=["Vieira, T. F.","Sousa, S. F."] + , cit_type="Reference") + , Citation(title="Benchmark of four popular virtual screening programs: construction of the active/decoy dataset remains a major determinant of measured performance" + , doi_url="https://doi.org/10.1186/s13321-016-0167-x" + , journal="Journal of Cheminformatics" + , contributors=["Chaput, L.","Martinez-Sanz, J.","Quiniou, E.","Rigolet, P.","Saettel, N.","Mouawad, L."] + , cit_type="Reference") + , Citation(title="Array programming with NumPy" + , doi_url="https://doi.org/10.1038/s41586-020-2649-2" + , journal="Nature" + , contributors=["Harris, C. R."] + , cit_type="Reference") + , Citation(title="Matplotlib: A 2D graphics environment" + , doi_url="https://doi.org/10.1109/mcse.2007.55" + , journal="Computing in Science & Engineering" + , contributors=["Hunter, J. D."] + , cit_type="Reference") + ], citations = [ + Citation(doi_url = "https://doi.org/10.1021/acsomega.1c04320" + , title ="Novel Anti-Hepatitis B Virus Activity of Euphorbia schimperi and Its Quercetin and Kaempferol Derivatives" + , journal="ACS Omega" + , contributors=["Mohammad K. 
Parvez","Sarfaraz Ahmed","Mohammed S. Al-Dosari","Mazin A. S. Abdelwahid","Ahmed H. Arbab","Adnan J. Al-Rehaily","Mai M. Al-Oqail"],cit_type="Citation"), + + ] + ) + } \ No newline at end of file diff --git a/input/test/test_input.py b/input/test/test_input.py new file mode 100755 index 0000000000000000000000000000000000000000..b2ca55f961565fd1192b72ce992c9ff95bd23020 --- /dev/null +++ b/input/test/test_input.py @@ -0,0 +1,82 @@ +import unittest +from input.get.journal_fetcher import JournalFetcher +from input.interface import InputInterface +from input.publication import Publication + +""" +Testing the Publication fetcher + +Publication 1: 'https://doi.org/10.1021/acs.jcim.1c00203' +Publication 2: 'doi.org/10.1021/acs.jcim.1c00917' +Publication 3: '10.1038/nchem.1781' +Publication 4: '11.12/jaj' +Publication 5: '11.12/' +Publication 6: 'https://doi.org/10.1021/acs.jmedchem.0c01332' # Paper is a PDF +""" +# TODO: Testcases for: +# - Specific Journals: Inherit from FetcherTestCase +# - interface module-importer (test case) +# - Error detection +# - wrong/no Journal_fetchers +# - wrong urls +# - correct Types in publication +# - Edgecases (i.e. 
paper as pdf, no connection, etc) + + +class InterfaceTestCase(unittest.TestCase): + def setUp(self): + self.assertEqual(InputInterface.instance, None) + self.interface = InputInterface() + + def test_singleton(self): + # interface should already be made in setUp() + self.assertNotEqual(self.interface.instance, None) + new_interface = InputInterface() + self.assertEqual(self.interface, new_interface) + + # def test_imported_modules(self): + # fetchers = self.interface.get_supported_fetchers + +class FetcherTestCase(unittest.TestCase): + + + def can_use_url_test(self, fetcher : JournalFetcher, test_url: str, expected_res: bool): + # Tests the 'can_use_url'-method + self.assertEqual(fetcher.can_use_url(test_url), expected_res) + + + def get_publication_test(self, fetcher : JournalFetcher, test_url: str, expected_res: Publication): + """ + this test asserts that every variable is equals to the expected result + """ + actual_res = fetcher.get_publication(test_url) + self.assertEqual(actual_res.doi_url, expected_res.doi_url) + self.assertEqual(actual_res.title, expected_res.title) + self.assertEqual(actual_res.contributors, expected_res.contributors) + self.assertEqual(actual_res.journal, expected_res.journal) + self.assertEqual(actual_res.publication_date, expected_res.publication_date) + self.assertEqual(actual_res.subjects, expected_res.subjects) + + # Checking for all references + self.assertEqual(len(actual_res.references), len(expected_res.references)) + num_references = len(expected_res.references) + for i in range(num_references): + self.assertEqual(actual_res.references[i].doi_url, expected_res.references[i].doi_url) + self.assertEqual(actual_res.references[i].journal, expected_res.references[i].journal) + self.assertEqual(actual_res.references[i].contributors, expected_res.references[i].contributors) + self.assertEqual(actual_res.references[i].cit_type, expected_res.references[i].cit_type) + + # Checking for all citations + 
self.assertEqual(len(actual_res.citations), len(expected_res.citations)) + num_citations = len(expected_res.citations) + for i in range(num_citations): + self.assertEqual(actual_res.citations[i].doi_url, expected_res.citations[i].doi_url) + self.assertEqual(actual_res.citations[i].journal, expected_res.citations[i].journal) + self.assertEqual(actual_res.citations[i].contributors, expected_res.citations[i].contributors) + self.assertEqual(actual_res.citations[i].cit_type, expected_res.citations[i].cit_type) + + + def get_publication_exception_test(self, fetcher: JournalFetcher, test_url: str): + # Ckecks + with self.assertRaises(ValueError): + fetcher.get_publication(test_url) \ No newline at end of file diff --git a/input_old/README.md b/input_old/README.md new file mode 100644 index 0000000000000000000000000000000000000000..76bd11d5d70daac13e190f4d52269eb381413c69 --- /dev/null +++ b/input_old/README.md @@ -0,0 +1,3 @@ +# Projekt CiS-Projekt 2021/22 +Input-Skripts + diff --git a/input_old/__pycache__/input_fj.cpython-39.pyc b/input_old/__pycache__/input_fj.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a3e6099f4ab4c56400b2698c812d4b5fc9a9a7aa Binary files /dev/null and b/input_old/__pycache__/input_fj.cpython-39.pyc differ diff --git a/input_old/example_urls b/input_old/example_urls new file mode 100644 index 0000000000000000000000000000000000000000..96ac680c65edddcb495312000157edea1ab94884 --- /dev/null +++ b/input_old/example_urls @@ -0,0 +1,2 @@ +https://pubs.acs.org/doi/10.1021/acs.jcim.5b00332 +https://pubs.acs.org/doi/10.1021/acs.jcim.6b00709 diff --git a/input_old/input_fj.py b/input_old/input_fj.py new file mode 100644 index 0000000000000000000000000000000000000000..ecc8e68fc5a84a446ae3f09dcb5ed56e8d262766 --- /dev/null +++ b/input_old/input_fj.py @@ -0,0 +1,154 @@ +#!/usr/bin/env python3 +""" +Functions for information retrieval of articles from the ACS journal JCIM + +""" + +__author__ = "Florian Jochens" +__email__ = 
"fj@andaco.de" +__status__ = "Production" +#__copyright__ = "" +#__credits__ = ["", "", "", ""] +#__license__ = "" +#__version__ = "" +#__maintainer__ = "" + +from bs4 import BeautifulSoup as bs +import requests as req +import sys +from pathlib import Path + +class Publication: + #_registry = [] + _citations = [] + _references = [] + + def __init__(self, title, publication_date, contributors, doi_url, + subjects = None, num_citations = None): + #self._registry.append(self) + self.title = title + self.publication_date = publication_date + self.contributors = contributors + self.doi_url = doi_url + self.subjects = subjects + self.num_citations = num_citations + #self._citations = [] + #self._references = [] + +class Citation: + def __init__(self, title, journal, contributors, doi_url): + self.title = title + self.journal = journal + self.contributors = contributors + self.doi_url = doi_url + +class References: + def __init__(self, title, journal, contributors, doi_url): + self.title = title + self.journal = journal + self.contributors = contributors + self.doi_url = doi_url + +def get_article_info(soup): + header = soup.find('div', class_ = 'article_header-left pull-left') + article_title = header.find('span', class_ = 'hlFld-Title').text + publication_date = header.find('span', class_ = 'pub-date-value').text + for link in header.find('div', class_ = 'article_header-doiurl'): + doi_url = link.get('href') + subs = header.find('div', class_ = 'article_header-taxonomy') + subjects = [] + for sub in subs.find_all('a'): + subjects.append(sub.get('title')) + cons = header.find('ul', class_ = 'loa') + contributors = [] + for con in cons.find_all('span', class_ = 'hlFld-ContribAuthor'): + contributors.append(con.text) + numc = header.find('div', class_ = 'articleMetrics_count') + if not numc.a: + num_citations = 0 + else: + num_citations = numc.a.text + + pub = Publication(article_title, publication_date, contributors, doi_url, + subjects, num_citations) + return pub + +def 
get_download_url(): + export = soup.find('div', class_ = 'cit-download-dropdown_content') + url = 'https://pubs.acs.org' + for link in export.find_all('a'): + if link.get('title') == 'Citation and references': + url += link.get('href') + print(url) + return url + +def download(url): # Download citation and references file + if url.find('='): + filename = url.rsplit('=', 1)[1] + path = Path(('./files/' + filename)) + if path.is_file(): + print("File already exists") + else: + print("File does not exist") + +def get_citation_info(pub, num_citations, soup): + pub._citations = [] + details = soup.find('ol', class_ = 'cited-content_cbyCitation') + titles = [] + for title in details.find_all('span', + class_ = 'cited-content_cbyCitation_article-title'): + titles.append(title.text.replace('.', '')) + journal_names = [] + for name in details.find_all('span', + class_ = 'cited-content_cbyCitation_journal-name'): + journal_names.append(name.text) + doi_urls = [] + for url in details.find_all('a'): + doi_urls.append(url.get('href')) + contributors = [] + for contrib in details.find_all('span', + class_ = 'cited-content_cbyCitation_article-contributors'): + contributors.append(contrib.text) + for i in range(0, int(num_citations)): + pub._citations.append(Citation(titles[i], journal_names[i], + contributors[i], doi_urls[i])) +def print_pub_info(pub): + print(f'''Article title: {pub.title} +Publication date: {pub.publication_date} +DOI-URL: {pub.doi_url} + +Subjects:''') + print(*(pub.subjects), sep = ", ") + print('\nContributors:') + print(*(pub.contributors), sep = ", ") + + if int(pub.num_citations) > 0: + if int(pub.num_citations) == 1: + print(f'\nThis publication is cited by the following publication:\n') + else: + print(f'\nThis publication is cited by the following {pub.num_citations} publications:\n') + for citation in pub._citations: + print(f''' + Title: {citation.title} + Journal: {citation.journal} + Contributors: {citation.contributors} + DOI-URL: 
{citation.doi_url} + ''') + else: + print('\nThis publication is not cited by any other publication.') + +def input(url): + html_text = req.get(url).text + soup = bs(html_text, 'html.parser') + + pub = get_article_info(soup) + if int(pub.num_citations) > 0: + get_citation_info(pub, int(pub.num_citations), soup) + return pub + +#if len(sys.argv) != 2: +# sys.stderr.write('Usage: {} <url>\n'.format(sys.argv[0])) +# exit(1) +#url = sys.argv[1] +#pub = input(url) +#print_pub_info(pub) diff --git a/input_old/pub.py b/input_old/pub.py new file mode 100644 index 0000000000000000000000000000000000000000..13b90e804cd485813b731385b319b3077a017dd2 --- /dev/null +++ b/input_old/pub.py @@ -0,0 +1,32 @@ +class Publication: + #_registry = [] + #_citations = [] + #_references = [] + + def __init__(self, title, publication_date, contributors, doi_url, + subjects, num_citations): + #self._registry.append(self) + self.title = title + self.publication_date = publication_date + self.contributors = contributors + self.doi_url = doi_url + self.subjects = subjects + self.num_citations = num_citations + self.num_references = num_references + self._citations = [] + self._references = [] + +class Citation: + def __init__(self, title, journal, contributors, doi_url): + self.title = title + self.journal = journal + self.contributors = contributors + self.doi_url = doi_url + +class References: + def __init__(self, title, journal, contributors, doi_url): + self.title = title + self.journal = journal + self.contributors = contributors + self.doi_url = doi_url + diff --git a/input_old/test.py b/input_old/test.py new file mode 100755 index 0000000000000000000000000000000000000000..dc623ca182691e9e06a6713a4d3d5dcf0bbf23c2 --- /dev/null +++ b/input_old/test.py @@ -0,0 +1,15 @@ +#!/usr/bin/env python3 + +from input_fj import input, print_pub_info +import sys + +if len(sys.argv) != 3: + sys.stderr.write('Usage: {} <url> <url>\n'.format(sys.argv[0])) + exit(1) +url = sys.argv[1] +url2 = sys.argv[2] +pub 
= input(url) +print_pub_info(pub) +pub2 = input(url2) +print_pub_info(pub2) + diff --git a/input_old/x b/input_old/x new file mode 100644 index 0000000000000000000000000000000000000000..c8ade9d56a520a3ac57e5eadce8b81bb3e63c0dd --- /dev/null +++ b/input_old/x @@ -0,0 +1,234 @@ +Article title: Feasibility of Active Machine Learning for Multiclass Compound Classification +Publication date: January 7, 2016 +DOI-URL: https://doi.org/10.1021/acs.jcim.5b00332 + +Subjects: +Algorithms, Molecules, Drug discovery, Screening assays, Receptors + +Contributors: +Tobias Lang, Florian Flachsenberg, Ulrike von Luxburg, Matthias Rarey + +This publication is cited by the following 30 publications: + + + Title: Concepts of Artificial Intelligence for Computer-Assisted Drug Discovery + Journal: Chemical Reviews + Contributors: Xin Yang, Yifei Wang, Ryan Byrne, Gisbert Schneider, Shengyong Yang. + DOI-URL: https://doi.org/10.1021/acs.chemrev.8b00728 + + + Title: De Novo Molecule Design by Translating from Reduced Graphs to SMILES + Journal: Journal of Chemical Information and Modeling + Contributors: Peter Pogány, Navot Arad, Sam Genway, Stephen D. Pickett. + DOI-URL: https://doi.org/10.1021/acs.jcim.8b00626 + + + Title: Designing Algorithms To Aid Discovery by Chemical Robots + Journal: ACS Central Science + Contributors: Alon B. Henson, Piotr S. Gromski, Leroy Cronin. + DOI-URL: https://doi.org/10.1021/acscentsci.8b00176 + + + Title: Modeling Kinase Inhibition Using Highly Confident Data Sets + Journal: Journal of Chemical Information and Modeling + Contributors: Sorin Avram, Alina Bora, Liliana Halip, Ramona Curpăn. + DOI-URL: https://doi.org/10.1021/acs.jcim.7b00729 + + + Title: Predictive Models for Fast and Effective Profiling of Kinase Inhibitors + Journal: Journal of Chemical Information and Modeling + Contributors: Alina Bora, Sorin Avram, Ionel Ciucanu, Marius Raica, and Stefana Avram . 
+ DOI-URL: https://doi.org/10.1021/acs.jcim.5b00646 + + + Title: Evaluation of categorical matrix completion algorithms: toward improved active learning for drug discovery + Journal: Bioinformatics + Contributors: Huangqingbo Sun, Robert F Murphy, . + DOI-URL: https://doi.org/10.1093/bioinformatics/btab322 + + + Title: An Artificial Intelligence Approach Based on Hybrid CNN-XGB Model to Achieve High Prediction Accuracy through Feature Extraction, Classification and Regression for Enhancing Drug Discovery in Biomedicine + Journal: International Journal of Biology and Biomedical Engineering + Contributors: Mukesh Madanan, Biju T. Sayed, Nurul Akhmal Mohd Zulkefli, Nitha C. Velayudhan. + DOI-URL: https://doi.org/10.46300/91011.2021.15.22 + + + Title: Artificial Intelligence in Medicinal Chemistry + Journal: + Contributors: Edward Griffen, Alexander Dossetter, Andrew Leach, Shane Montague. + DOI-URL: https://doi.org/10.1002/0471266949.bmc267 + + + Title: Practical Chemogenomic Modeling and Molecule Discovery Strategies Unveiled by Active Learning + Journal: + Contributors: J.B. Brown. + DOI-URL: https://doi.org/10.1016/B978-0-12-801238-3.11533-8 + + + Title: Machine learning phases and criticalities without using real data for training + Journal: Physical Review B + Contributors: D.-R. Tan, F.-J. Jiang. + DOI-URL: https://doi.org/10.1103/PhysRevB.102.224434 + + + Title: Active learning effectively identifies a minimal set of maximally informative and asymptotically performant cytotoxic structure–activity patterns in NCI-60 cell lines + Journal: RSC Medicinal Chemistry + Contributors: Takumi Nakano, Shunichi Takeda, J.B. Brown. + DOI-URL: https://doi.org/10.1039/D0MD00110D + + + Title: Active learning efficiently converges on rational limits of toxicity prediction and identifies patterns for molecule design + Journal: Computational Toxicology + Contributors: Ahsan Habib Polash, Takumi Nakano, Christin Rakers, Shunichi Takeda, J.B. Brown. 
+ DOI-URL: https://doi.org/10.1016/j.comtox.2020.100129 + + + Title: Practical considerations for active machine learning in drug discovery + Journal: Drug Discovery Today: Technologies + Contributors: Daniel Reker. + DOI-URL: https://doi.org/10.1016/j.ddtec.2020.06.001 + + + Title: Designing compact training sets for data-driven molecular property prediction through optimal exploitation and exploration + Journal: Molecular Systems Design & Engineering + Contributors: Bowen Li, Srinivas Rangarajan. + DOI-URL: https://doi.org/10.1039/C9ME00078J + + + Title: Applicability Domain of Active Learning in Chemical Probe Identification: Convergence in Learning from Non-Specific Compounds and Decision Rule Clarification + Journal: Molecules + Contributors: Ahsan Habib Polash, Takumi Nakano, Shunichi Takeda, J.B. Brown. + DOI-URL: https://doi.org/10.3390/molecules24152716 + + + Title: Capturing and applying knowledge to guide compound optimisation + Journal: Drug Discovery Today + Contributors: Matthew Segall, Tamsin Mansley, Peter Hunt, Edmund Champness. + DOI-URL: https://doi.org/10.1016/j.drudis.2019.02.004 + + + Title: A novel graph kernel on chemical compound classification + Journal: Journal of Bioinformatics and Computational Biology + Contributors: Qiangrong Jiang, Jiajia Ma. + DOI-URL: https://doi.org/10.1142/S0219720018500269 + + + Title: Accelerating Drug Discovery Using Convolution Neural Network Based Active Learning + Journal: + Contributors: Pengfei Liu, Kwong-Sak Leung. + DOI-URL: https://doi.org/10.1109/TENCON.2018.8650298 + + + Title: An Adaptive Lightweight Security Framework Suited for IoT + Journal: + Contributors: Menachem Domb. + DOI-URL: https://doi.org/10.5772/intechopen.73712 + + + Title: Adaptive mining and model building of medicinal chemistry data with a multi-metric perspective + Journal: Future Medicinal Chemistry + Contributors: JB Brown. 
+ DOI-URL: https://doi.org/10.4155/fmc-2018-0188 + + + Title: Chemogenomic Active Learning's Domain of Applicability on Small, Sparse qHTS Matrices: A Study Using Cytochrome P450 and Nuclear Hormone Receptor Families + Journal: ChemMedChem + Contributors: Christin Rakers, Rifat Ara Najnin, Ahsan Habib Polash, Shunichi Takeda, J.B. Brown. + DOI-URL: https://doi.org/10.1002/cmdc.201700677 + + + Title: Automating drug discovery + Journal: Nature Reviews Drug Discovery + Contributors: Gisbert Schneider. + DOI-URL: https://doi.org/10.1038/nrd.2017.232 + + + Title: Classifiers and their Metrics Quantified + Journal: Molecular Informatics + Contributors: J. B. Brown. + DOI-URL: https://doi.org/10.1002/minf.201700127 + + + Title: Active Search for Computer-aided Drug Design + Journal: Molecular Informatics + Contributors: Dino Oglic, Steven A. Oatley, Simon J. F. Macdonald, Thomas Mcinally, Roman Garnett, Jonathan D. Hirst, Thomas Gärtner. + DOI-URL: https://doi.org/10.1002/minf.201700130 + + + Title: Selection of Informative Examples in Chemogenomic Datasets + Journal: + Contributors: Daniel Reker, J. B. Brown. + DOI-URL: https://doi.org/10.1007/978-1-4939-8639-2_13 + + + Title: The value of prior knowledge in machine learning of complex network systems + Journal: Bioinformatics + Contributors: Dana Ferranti, David Krane, David Craft, . + DOI-URL: https://doi.org/10.1093/bioinformatics/btx438 + + + Title: Lightweight adaptive Random-Forest for IoT rule generation and execution + Journal: Journal of Information Security and Applications + Contributors: Menachem Domb, Elisheva Bonchek-Dokow, Guy Leshem. + DOI-URL: https://doi.org/10.1016/j.jisa.2017.03.001 + + + Title: Active learning for computational chemogenomics + Journal: Future Medicinal Chemistry + Contributors: Daniel Reker, Petra Schneider, Gisbert Schneider, JB Brown. 
+ DOI-URL: https://doi.org/10.4155/fmc-2016-0197 + + + Title: Small Random Forest Models for Effective Chemogenomic Active Learning + Journal: Journal of Computer Aided Chemistry + Contributors: Christin Rakers, Daniel Reker, J.B. Brown. + DOI-URL: https://doi.org/10.2751/jcac.18.124 + + + Title: Large-Scale Off-Target Identification Using Fast and Accurate Dual Regularized One-Class Collaborative Filtering and Its Application to Drug Repurposing + Journal: PLOS Computational Biology + Contributors: Hansaim Lim, Aleksandar Poleksic, Yuan Yao, Hanghang Tong, Di He, Luke Zhuang, Patrick Meng, Lei Xie, . + DOI-URL: https://doi.org/10.1371/journal.pcbi.1005135 + +Article title: Matched Molecular Series: Measuring SAR Similarity +Publication date: May 1, 2017 +DOI-URL: https://doi.org/10.1021/acs.jcim.6b00709 + +Subjects: +Substituents, Mathematical methods, Structure activity relationship, Biological databases + +Contributors: +Emanuel S. R. Ehmki, Christian Kramer + +This publication is cited by the following 5 publications: + + + Title: Matched Molecular Series Analysis for ADME Property Prediction + Journal: Journal of Chemical Information and Modeling + Contributors: Mahendra Awale, Sereina Riniker, Christian Kramer. + DOI-URL: https://doi.org/10.1021/acs.jcim.0c00269 + + + Title: Approaches using AI in medicinal chemistry + Journal: + Contributors: Christian Tyrchan, Eva Nittinger, Dea Gogishvili, Atanas Patronov, Thierry Kogej. + DOI-URL: https://doi.org/10.1016/B978-0-12-822249-2.00002-5 + + + Title: Bioactivity Prediction Based on Matched Molecular Pair and Matched Molecular Series Methods + Journal: Current Pharmaceutical Design + Contributors: Xiaoyu Ding, Chen Cui, Dingyan Wang, Jihui Zhao, Mingyue Zheng, Xiaomin Luo, Hualiang Jiang, Kaixian Chen. + DOI-URL: https://doi.org/10.2174/1381612826666200427111309 + + + Title: BRADSHAW: a system for automated molecular design + Journal: Journal of Computer-Aided Molecular Design + Contributors: Darren V. S. 
# --- verarbeitung/Processing.py ---
# -*- coding: utf-8 -*-
"""
Functions to generate a graph representing citations between multiple
ACS/Nature journals.

The graph lives in the module-level lists `nodes` (Publication objects) and
`edges` (pairs [citing_doi, cited_doi]); both are (re)created by
process_main(), the single public entry point.
"""

__authors__ = "Donna Löding, Alina Molkentin, Xinyi Tang, Judith Große, Malte Schokolowski"
__email__ = "cis-project2021@zbh.uni-hamburg.de"
__status__ = "Production"

from bs4 import BeautifulSoup as bs  # NOTE(review): appears unused in this module - confirm before removing
import requests as req               # NOTE(review): appears unused in this module - confirm before removing
import sys
from pathlib import Path
from input_fj import input
from input_test import input_test_func
from json_demo import output_to_json


def initialize_nodes_list(doi_input_list, search_depth_max, search_height_max, test_var):
    """Adds every input publication to the graph and collects its first level
    of references and citations.

    doi_input_list    -- list of publication dois given by the user; duplicate
                         entries are removed from it in place
    search_depth_max  -- maximum reference (depth) expansion
    search_height_max -- maximum citation (height) expansion
    test_var          -- True selects the offline fixture input for unit tests

    Returns (references_pub_obj_list, citations_pub_obj_list), the publication
    objects forming the first depth/height level.
    """
    references_pub_obj_list = []
    citations_pub_obj_list = []

    # Iterate over a snapshot: removing from the list being iterated makes
    # Python skip the element after each removal (bug in the original).
    for pub_doi in list(doi_input_list):
        # Chooses the input function: fixture for tests, real fetch otherwise.
        pub = input_test_func(pub_doi) if test_var else input(pub_doi)

        if any(node.doi_url == pub.doi_url for node in nodes):
            # Duplicate input doi: drop it from the caller's list.
            doi_input_list.remove(pub_doi)
        else:
            nodes.append(pub)
            pub.group = "input"

        # First-level references; nodes/edges are only created when
        # search_depth_max > 0 (see create_graph_structure_references).
        references_pub_obj_list.extend(
            create_graph_structure_references(pub, 0, search_depth_max, test_var))

        # First-level citations; analogous to the references above.
        citations_pub_obj_list.extend(
            create_graph_structure_citations(pub, 0, search_height_max, test_var))

    return (references_pub_obj_list, citations_pub_obj_list)


def complete_inner_edges(test_var):
    """Adds the edges that connect already-known nodes of the citation
    ("height") group with nodes of the reference ("depth") group.

    test_var is accepted for call-site symmetry but not used here.
    """
    for node in nodes:
        if node.group == "depth":
            for citation in node.citations:
                for other in nodes:
                    if (citation.doi_url == other.doi_url
                            and [citation.doi_url, node.doi_url] not in edges):
                        edges.append([citation.doi_url, node.doi_url])
        if node.group == "height":
            for reference in node.references:
                for other in nodes:
                    if (reference.doi_url == other.doi_url
                            and [node.doi_url, reference.doi_url] not in edges):
                        edges.append([node.doi_url, reference.doi_url])


def create_graph_structure_references(pub, search_depth, search_depth_max, test_var):
    """Adds a node and a pub->reference edge for every yet-unknown reference
    of pub (only while search_depth < search_depth_max); for references that
    are already nodes, only the missing edge is added.

    Returns the list of newly created Publication objects.
    """
    references_pub_obj_list = []
    for reference in pub.references:
        already_known = any(reference.doi_url == node.doi_url for node in nodes)
        if not already_known:
            if search_depth < search_depth_max:
                # Fixture input for unit tests, real fetch otherwise.
                if test_var:
                    reference_pub_obj = input_test_func(reference.doi_url)
                else:
                    reference_pub_obj = input(reference.doi_url)

                reference_pub_obj.group = "depth"
                nodes.append(reference_pub_obj)
                edges.append([pub.doi_url, reference_pub_obj.doi_url])
                references_pub_obj_list.append(reference_pub_obj)
        elif [pub.doi_url, reference.doi_url] not in edges:
            # Node exists already: only the edge is missing.
            edges.append([pub.doi_url, reference.doi_url])
    return references_pub_obj_list


def process_references_rec(references_pub_obj_list, search_depth, search_depth_max, test_var):
    """Recursively expands the reference side of the graph.

    references_pub_obj_list -- publications whose references form the next level
    search_depth            -- depth of the level currently being added
    search_depth_max        -- maximum depth to expand to
    """
    for pub in references_pub_obj_list:
        next_level = create_graph_structure_references(pub, search_depth, search_depth_max, test_var)
        # Descend only while the maximum depth has not yet been reached.
        if search_depth < search_depth_max:
            process_references_rec(next_level, search_depth + 1, search_depth_max, test_var)


def create_graph_structure_citations(pub, search_height, search_height_max, test_var):
    """Adds a node and a citation->pub edge for every yet-unknown citation of
    pub (only while search_height < search_height_max); for citations that are
    already nodes, only the missing edge is added.

    Returns the list of newly created Publication objects.
    """
    citations_pub_obj_list = []
    for citation in pub.citations:
        already_known = any(citation.doi_url == node.doi_url for node in nodes)
        if not already_known:
            if search_height < search_height_max:
                # Fixture input for unit tests, real fetch otherwise.
                if test_var:
                    citation_pub_obj = input_test_func(citation.doi_url)
                else:
                    citation_pub_obj = input(citation.doi_url)

                citation_pub_obj.group = "height"
                nodes.append(citation_pub_obj)
                edges.append([citation_pub_obj.doi_url, pub.doi_url])
                citations_pub_obj_list.append(citation_pub_obj)
        elif [citation.doi_url, pub.doi_url] not in edges:
            # Node exists already: only the edge is missing.
            edges.append([citation.doi_url, pub.doi_url])
    return citations_pub_obj_list


def process_citations_rec(citations_pub_obj_list, search_height, search_height_max, test_var):
    """Recursively expands the citation side of the graph.

    citations_pub_obj_list -- publications whose citations form the next level
    search_height          -- height of the level currently being added
    search_height_max      -- maximum height to expand to
    """
    for pub in citations_pub_obj_list:
        next_level = create_graph_structure_citations(pub, search_height, search_height_max, test_var)
        # Ascend only while the maximum height has not yet been reached.
        if search_height < search_height_max:
            process_citations_rec(next_level, search_height + 1, search_height_max, test_var)


def process_main(doi_input_list, search_height, search_depth, test_var=False):
    """Builds the citation graph and writes it to a .json file.

    doi_input_list -- input list of publication dois
    search_height  -- maximum citation (height) expansion, must be >= 0
    search_depth   -- maximum reference (depth) expansion, must be >= 0
    test_var       -- True only for unit tests (uses fixture input)

    Returns (list of node dois, edges) when test_var is True, otherwise None.
    On invalid input an error message is printed and processing stops
    (the original printed the message but carried on regardless).
    """
    if len(doi_input_list) == 0:
        print("Error, no input data")
        return
    if search_height < 0:
        print("Error, search_height of search must be positive")
        return
    if search_depth < 0:
        print("Error, search_depth of search must be positive")
        return

    # Fresh node/edge containers for this run (module-level on purpose: the
    # helper functions above all append to them).
    global nodes, edges
    nodes = []
    edges = []

    # Initializes nodes/edges from the input and returns the first-level
    # citation and reference publication objects.
    references_obj_list, citations_obj_list = initialize_nodes_list(
        doi_input_list, search_depth, search_height, test_var)

    # Recursive processing up to the maximum depth/height.
    process_citations_rec(citations_obj_list, 1, search_height, test_var)
    process_references_rec(references_obj_list, 1, search_depth, test_var)

    # Adds edges between the reference group and the citation group of
    # publications that are already known.
    complete_inner_edges(test_var)

    # Saves nodes and edges of the graph in a .json file.
    output_to_json(nodes, edges)

    # Only for unit tests.
    if test_var:
        return ([node.doi_url for node in nodes], edges)


def print_graph(nodes, edges):
    """Prints the titles of all nodes and all edges of a graph."""
    print("Knoten:\n")
    for node in nodes:
        print(node.title, "\n")
    print("\nKanten:\n")
    for edge in edges:
        print(edge, "\n")


def try_known_publications():
    """Manual smoke test (no UI connection yet): builds and prints a graph
    for two known publications."""
    doi_list = [
        'https://pubs.acs.org/doi/10.1021/acs.jcim.9b00249',
        'https://doi.org/10.1021/acs.jmedchem.0c01332',
    ]

    # process_main only returns a value in test mode; it stores the result in
    # the module-level nodes/edges (the original unpacked its None return,
    # which raised a TypeError).
    process_main(doi_list, 2, 2)
    print_graph(nodes, edges)


# --- verarbeitung/Processing_unittest.py ---
import unittest
from Processing import process_main


class ProcessingTest(unittest.TestCase):
    """Unit tests for process_main, driven by the fixtures in input_test.py."""

    def testCycle(self):
        # Two publications citing each other must not recurse forever.
        nodes, edges = process_main(['doiz1'], 1, 1, True)
        self.assertCountEqual(nodes, ['doiz1', 'doiz2'])
        self.assertCountEqual(edges, [['doiz1', 'doiz2'], ['doiz2', 'doiz1']])

        nodes, edges = process_main(['doiz1'], 2, 2, True)
        self.assertCountEqual(nodes, ['doiz1', 'doiz2'])
        self.assertCountEqual(edges, [['doiz2', 'doiz1'], ['doiz1', 'doiz2']])

    #def testBigCycle(self):

    #def testEmptyHeight(self):

    #def testEmptyDepth(self):

    def testEmptyDepthHeight(self):
        # With depth/height 0 only the inputs (and edges between them) appear.
        nodes, edges = process_main(['doi1'], 0, 0, True)
        self.assertCountEqual(nodes, ['doi1'])
        self.assertCountEqual(edges, [])

        nodes, edges = process_main(['doi1', 'doi2'], 0, 0, True)
        self.assertCountEqual(nodes, ['doi1', 'doi2'])
        self.assertCountEqual(edges, [['doi1', 'doi2']])

        nodes, edges = process_main(['doi1', 'doi2', 'doi3'], 0, 0, True)
        self.assertCountEqual(nodes, ['doi1', 'doi2', 'doi3'])
        self.assertCountEqual(edges, [['doi3', 'doi1'], ['doi1', 'doi2']])

    def testInnerEdges(self):
        # Edges between the citation and reference groups are completed.
        nodes, edges = process_main(['doi_ie1'], 1, 1, True)
        self.assertCountEqual(nodes, ['doi_ie1', 'doi_ie2', 'doi_ie3'])
        self.assertCountEqual(edges, [['doi_ie1', 'doi_ie2'], ['doi_ie3', 'doi_ie1'], ['doi_ie3', 'doi_ie2']])

    def testRightHeight(self):
        # Citation expansion stops exactly at the requested height.
        nodes, edges = process_main(['doi_h01'], 1, 0, True)
        self.assertCountEqual(nodes, ['doi_h01'])
        self.assertCountEqual(edges, [])

        nodes, edges = process_main(['doi_h02'], 1, 0, True)
        self.assertCountEqual(nodes, ['doi_h02', 'doi_h1'])
        self.assertCountEqual(edges, [['doi_h1', 'doi_h02']])

        nodes, edges = process_main(['doi_h02'], 2, 0, True)
        self.assertCountEqual(nodes, ['doi_h02', 'doi_h1', 'doi_h2'])
        self.assertCountEqual(edges, [['doi_h1', 'doi_h02'], ['doi_h2', 'doi_h1']])

    def testRightDepth(self):
        # Reference expansion stops exactly at the requested depth.
        nodes, edges = process_main(['doi_d01'], 0, 1, True)
        self.assertCountEqual(nodes, ['doi_d01'])
        self.assertCountEqual(edges, [])

        nodes, edges = process_main(['doi_d02'], 0, 1, True)
        self.assertCountEqual(nodes, ['doi_d02', 'doi_d1'])
        self.assertCountEqual(edges, [['doi_d02', 'doi_d1']])

        nodes, edges = process_main(['doi_d02'], 0, 2, True)
        self.assertCountEqual(nodes, ['doi_d02', 'doi_d1', 'doi_d2'])
        self.assertCountEqual(edges, [['doi_d02', 'doi_d1'], ['doi_d1', 'doi_d2']])


if __name__ == "__main__":
    unittest.main()
b/verarbeitung/__pycache__/input_fj.cpython-38.pyc differ diff --git a/verarbeitung/__pycache__/input_fj.cpython-39.pyc b/verarbeitung/__pycache__/input_fj.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..175f9ebbfdf5f3313196b4f10aa01dc2e8e20509 Binary files /dev/null and b/verarbeitung/__pycache__/input_fj.cpython-39.pyc differ diff --git a/verarbeitung/__pycache__/input_test.cpython-36.pyc b/verarbeitung/__pycache__/input_test.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..85878d6d127d9d2bd5efe9130672d982bb70c5fa Binary files /dev/null and b/verarbeitung/__pycache__/input_test.cpython-36.pyc differ diff --git a/verarbeitung/__pycache__/input_test.cpython-38.pyc b/verarbeitung/__pycache__/input_test.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..df395212453392e135532b12396cd4c30a92ea05 Binary files /dev/null and b/verarbeitung/__pycache__/input_test.cpython-38.pyc differ diff --git a/verarbeitung/__pycache__/input_test.cpython-39.pyc b/verarbeitung/__pycache__/input_test.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..68e42fd6a47a02787524c68816a42574834931d2 Binary files /dev/null and b/verarbeitung/__pycache__/input_test.cpython-39.pyc differ diff --git a/verarbeitung/__pycache__/json_demo.cpython-36.pyc b/verarbeitung/__pycache__/json_demo.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..04acef5f40630ee2c7b6e887e33dc740b5e16a74 Binary files /dev/null and b/verarbeitung/__pycache__/json_demo.cpython-36.pyc differ diff --git a/verarbeitung/__pycache__/json_demo.cpython-38.pyc b/verarbeitung/__pycache__/json_demo.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4a1e7ba987775a20fddaa4a8f846bb238670d6a1 Binary files /dev/null and b/verarbeitung/__pycache__/json_demo.cpython-38.pyc differ diff --git a/verarbeitung/__pycache__/json_demo.cpython-39.pyc 
# --- verarbeitung/input_test.py ---
# Offline fixtures: a tiny in-memory "publication database" plus the lookup
# functions the processing unit uses when running with test_var=True.

class Publication:
    """Fixture stand-in for a fetched publication.

    references/citations are passed as lists of doi strings and resolved into
    Reference/Citation fixture objects against list_of_arrays; None means
    "no data" and yields an empty list.
    """

    def __init__(self, doi_url, title, contributors, journal, publication_date, references, citations, group):
        self.doi_url = doi_url
        self.title = title
        self.contributors = contributors
        self.journal = journal
        self.publication_date = publication_date
        self.references = [] if references is None else ref(references)
        self.citations = [] if citations is None else cit(citations)
        self.group = group


class Citation:
    """Lightweight fixture record for a publication that cites another."""

    def __init__(self, doi_url, title, contributors, journal, publication_date):
        self.doi_url = doi_url
        self.title = title
        self.contributors = contributors
        self.journal = journal
        self.publication_date = publication_date


class Reference:
    """Lightweight fixture record for a publication cited by another."""

    def __init__(self, doi_url, title, contributors, journal, publication_date):
        self.doi_url = doi_url
        self.title = title
        self.contributors = contributors
        self.journal = journal
        self.publication_date = publication_date


def input_test_func(pub_doi):
    """Returns the fixture Publication for pub_doi, or None if unknown."""
    for row in list_of_arrays:
        if pub_doi == row[0]:
            return Publication(row[0], row[1], row[2], row[3], row[4], row[5], row[6], row[7])
    # Explicit None (the original fell through and returned None implicitly).
    return None


def cit(list_doi):
    """Resolves doi strings to Citation fixture objects.

    Unknown dois are silently skipped, mirroring the fixture's intent.
    """
    cits = []
    for doi_url in list_doi:
        for row in list_of_arrays:
            if doi_url == row[0]:
                cits.append(Citation(row[0], row[1], row[2], row[3], row[4]))
    return cits


def ref(list_doi):
    """Resolves doi strings to Reference fixture objects.

    Bug fix: the original constructed Citation objects here, so every
    Publication.references entry was mis-typed.
    """
    refs = []
    for doi_url in list_doi:
        for row in list_of_arrays:
            if doi_url == row[0]:
                refs.append(Reference(row[0], row[1], row[2], row[3], row[4]))
    return refs


# Fixture rows: [doi, title, contributors, journal, date, references, citations, group]
beispiel1 = ['doi1', 'title1', ['contributor1'], 'journal1', 'date1', ['doi2'], ['doi3'], '']
beispiel2 = ['doi2', 'title2', ['contributor2'], 'journal2', 'date2', [], ['doi1'], '']
beispiel3 = ['doi3', 'title3', ['contributor3'], 'journal3', 'date3', ['doi1'], [], '']

zyklus1 = ['doiz1', 'titlez1', ['contributorz1.1', 'contributorz1.2'], 'journalz1', 'datez1', ['doiz2'], ['doiz2'], '']
zyklus2 = ['doiz2', 'titlez2', ['contributorz2.1', 'contributorz2.2'], 'journalz2', 'datez2', ['doiz1'], ['doiz1'], '']

inner_edge1 = ['doi_ie1', 'title_ie1', ['contributor_ie1.1', 'contributor_ie1.2'], 'journal_ie1', 'date_ie1', ['doi_ie2'], ['doi_ie3'], '']
inner_edge2 = ['doi_ie2', 'title_ie2', ['contributor_ie2.1', 'contributor_ie2.2'], 'journal_ie2', 'date_ie2', [], ['doi_ie1', 'doi_ie3'], '']
inner_edge3 = ['doi_ie3', 'titlez_ie3', ['contributor_ie3.1', 'contributor_ie3.2'], 'journal_ie3', 'date_ie3', ['doi_ie1', 'doi_ie2'], [], '']

right_height01 = ['doi_h01', 'title_h01', ['contributor_h01'], 'journal_h01', 'date_h01', [], [], '']
right_height02 = ['doi_h02', 'title_h02', ['contributor_h02'], 'journal_h02', 'date_h02', [], ['doi_h1'], '']
right_height1 = ['doi_h1', 'title_h1', ['contributor_h1'], 'journal_h1', 'date_h1', [], ['doi_h2'], '']
right_height2 = ['doi_h2', 'title_h2', ['contributor_h2'], 'journal_h2', 'date_h2', [], ['doi_h3'], '']
right_height3 = ['doi_h3', 'title_h3', ['contributor_h3'], 'journal_h3', 'date_h3', [], [], '']

right_depth01 = ['doi_d01', 'title_d01', ['contributor_d01'], 'journal_d01', 'date_d01', [], [], '']
right_depth02 = ['doi_d02', 'title_d02', ['contributor_d02'], 'journal_d02', 'date_d02', ['doi_d1'], [], '']
right_depth1 = ['doi_d1', 'title_d1', ['contributor_d1'], 'journal_d1', 'date_d1', ['doi_d2'], [], '']
right_depth2 = ['doi_d2', 'title_d2', ['contributor_d2'], 'journal_d2', 'date_d2', ['doi_d3'], [], '']
right_depth3 = ['doi_d3', 'title_d3', ['contributor_d3'], 'journal_d3', 'date_d3', [], [], '']

list_of_arrays = [beispiel1, beispiel2, beispiel3, zyklus1, zyklus2, inner_edge1, inner_edge2, inner_edge3,
                  right_height01, right_height02, right_height1, right_height2, right_height3,
                  right_depth01, right_depth02, right_depth1, right_depth2, right_depth3]


# --- verarbeitung/json_demo.py ---
"""
Functions that format the computed graph to match the interface to the
output part and save it as JSON.
"""

import json
# The original also did `from input_fj import input`, which is never used in
# this module and was removed.


def format_nodes(V):
    """Returns a list with one attribute dict per node/publication in V."""
    list_of_node_dicts = []
    for node in V:
        list_of_node_dicts.append({
            "name": node.title,
            "author": node.contributors,
            "year": node.publication_date,
            "journal": node.journal,
            "doi": node.doi_url,
            "group": node.group,
        })
    return list_of_node_dicts


def format_edges(E):
    """Returns a list of {"source": ..., "target": ...} dicts, one per
    [source, target] pair in E."""
    return [{"source": edge[0], "target": edge[1]} for edge in E]


def output_to_json(V, E, filename='json_text.json'):
    """Combines nodes V and edges E into {"nodes": [...], "links": [...]}
    and writes it to `filename`.

    `filename` defaults to the previously hard-coded 'json_text.json', so
    existing callers are unaffected.
    """
    dict_of_all = {
        "nodes": format_nodes(V),
        "links": format_edges(E),
    }
    with open(filename, 'w') as outfile:
        json.dump(dict_of_all, outfile)


# --- verarbeitung/nötige Tests.txt (notes on tests still needed) ---
# Zyklus
# großer Zyklus
# Innere Kanten vervollständigen