diff --git a/README.md b/README.md index c109cc8df99fda480c5ac4b666d258c53c6e46c8..d4d320b0fdc75bcfe55b8e8b18eebe1b51f986fc 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ # Starten des Programms: -Um das Programm nutzen zu können muss zuerst \grqq citation\_parser\_ui.py\grqq \, ausgeführt werden und der entstandene Lik in einen Browser eingefügt werden. Danach öffnet sich die Benutzeroberfläche im Browser. +Um das Programm nutzen zu können, muss zuerst 'citation\_parser\_ui.py' ausgeführt werden und der entstandene Link in einen Browser eingefügt werden. Danach öffnet sich die Benutzeroberfläche im Browser. # Übersicht der Benutzeroberfläche: @@ -43,5 +43,6 @@ Um das Programm nutzen zu können muss zuerst \grqq citation\_parser\_ui.py\grqq - Alina Molkentin - Donna Löding - Malte Schokolowski +- Judith Große - Katja Ehlers - Merle Stahl diff --git a/verarbeitung/construct_new_graph/add_citations_rec.py b/verarbeitung/construct_new_graph/add_citations_rec.py index 9467fe412b01c7f4d78d20c80296b8cb3f184373..82ae23fd9c38c4e8ee150be2418d393386ac42c1 100644 --- a/verarbeitung/construct_new_graph/add_citations_rec.py +++ b/verarbeitung/construct_new_graph/add_citations_rec.py @@ -6,7 +6,7 @@ Functions to add citations recursively for multiple ACS/Nature journals __authors__ = "Donna Löding, Alina Molkentin, Judith Große, Malte Schokolowski" __email__ = "cis-project2021@zbh.uni-hamburg.de" -__status__ = "Production" +__status__ = "Finished" # __copyright__ = "" # __credits__ = ["", "", "", ""] @@ -28,11 +28,26 @@ from verarbeitung.get_pub_from_input import get_pub def create_graph_structure_citations_test(pub, search_depth, search_depth_max, cit_type, test_var, test_nodes, test_edges): ''' - :param test_nodes: list of publications from unit test - :type test_nodes: List[Publication] + :param pub: publication whose citations will be added + :type pub: Publication + + :param search_depth: current depth to search for citations + :type search_depth: int + + :param 
search_depth_max: maximum depth to search for citations + :type search_depth_max: int - :param test_edges: list of links from unit test - :type test_edges: List[List[String,String]] + :param cit_type: variable to differentiate citation and reference call + :type cit_type: String + + :param test_var: variable to differentiate between test and url call + :type test_var: boolean + + :param test_nodes: list of publications from unit test + :type test_nodes: List[Publication] + + :param test_edges: list of links from unit test + :type test_edges: List[List[String,String]] for unit test purposes only ''' @@ -117,12 +132,16 @@ def create_graph_structure_citations(pub, search_depth, search_depth_max, cit_ty citations_pub_obj_list = [] for citation in get_cit_type_list(pub, cit_type): not_in_nodes = True - for node in nodes: # checks every citation for duplication + for node in nodes: + + # checks every citation for duplication if (citation.doi_url == node.doi_url): not_in_nodes = False break if (not_in_nodes): - if (search_depth < search_depth_max): # checks if its a test and chooses input function accordingly + if (search_depth < search_depth_max): + + # checks if its a test and chooses input function accordingly citation_pub_obj = get_pub(citation.doi_url, test_var) if (type(citation_pub_obj) != Publication): # print(pub) @@ -165,11 +184,10 @@ def process_citations_rec(citations_pub_obj_list, search_depth, search_depth_max :param test_var: variable to differentiate between test and url call :type test_var: boolean - recursive function to implement depth-first-search on citations + recursive function to implement breadth-first-search on citations ''' - # adds next level to nodes/edges - + # searches citations for every publication in list and adds the one found to new list new_citation_pub_obj_save_list = [] for pub in citations_pub_obj_list: new_citation_pub_obj_list = create_graph_structure_citations(pub, search_depth, search_depth_max, cit_type, @@ -177,7 +195,7 @@ def 
process_citations_rec(citations_pub_obj_list, search_depth, search_depth_max if len(new_citation_pub_obj_list) > 0: new_citation_pub_obj_save_list += new_citation_pub_obj_list - # If the maximum depth has not yet been reached, calls function recursively with increased depth + # If the maximum depth has not yet been reached, calls function recursively with increased depth if (search_depth < search_depth_max): process_citations_rec(new_citation_pub_obj_save_list, search_depth + 1, search_depth_max, cit_type, test_var) diff --git a/verarbeitung/construct_new_graph/export_to_json.py b/verarbeitung/construct_new_graph/export_to_json.py index 00627dfb36edc6bcadce5c74b9487f7652f68921..2c40da91b8549faf15b6a08fd8d2a09c83b3ce21 100644 --- a/verarbeitung/construct_new_graph/export_to_json.py +++ b/verarbeitung/construct_new_graph/export_to_json.py @@ -6,7 +6,7 @@ Functions that format the computed graph to match the interface to the output-pa __authors__ = "Donna Löding, Alina Molkentin, Judith Große, Malte Schokolowski" __email__ = "cis-project2021@zbh.uni-hamburg.de" -__status__ = "Production" +__status__ = "Finished" # __copyright__ = "" # __credits__ = ["", "", "", ""] @@ -44,9 +44,7 @@ def format_nodes(nodes): list_of_node_dicts.append(new_dict) return list_of_node_dicts - -# creates a list that contains a dictionary for each edge -# the dictionaries contain the source as keys and the target as values + def format_edges(edges): ''' :param edges: list of links to export to json @@ -74,7 +72,7 @@ def output_to_json(nodes, edges, search_depth, search_height, json_file='json_te :param test_var: variable to differentiate between test and url call :type test_var: boolean - function to export nodes and links as a dictionary to json file + function to export nodes and links as a dictionary to a given json file ''' dict_of_all = dict() list_of_node_dicts = format_nodes(nodes) @@ -82,6 +80,8 @@ def output_to_json(nodes, edges, search_depth, search_height, json_file='json_te 
dict_of_all["nodes"] = list_of_node_dicts dict_of_all["links"] = list_of_edge_dicts dict_of_all["depth_height"] = [search_depth, search_height] + + # output to json. json name depends on test_var and if a non standard filename was given. if (test_var): if json_file != 'json_text.json': with open(json_file, 'w') as outfile: diff --git a/verarbeitung/construct_new_graph/initialize_graph.py b/verarbeitung/construct_new_graph/initialize_graph.py index 81571dbe0e9a193a1dfcbc0a92f8240d95c50b7c..92f9e941db80f92e4a6419b19b576d040ceaabd9 100644 --- a/verarbeitung/construct_new_graph/initialize_graph.py +++ b/verarbeitung/construct_new_graph/initialize_graph.py @@ -6,7 +6,7 @@ Functions to generate a graph representing citations between multiple ACS/Nature __authors__ = "Donna Löding, Alina Molkentin, Judith Große, Malte Schokolowski" __email__ = "cis-project2021@zbh.uni-hamburg.de" -__status__ = "Production" +__status__ = "Finished" # __copyright__ = "" # __credits__ = ["", "", "", ""] @@ -24,17 +24,29 @@ sys.path.append("../") from input.publication import Publication from verarbeitung.get_pub_from_input import get_pub -from .export_to_json import output_to_json from .add_citations_rec import add_citations, create_global_lists_cit def initialize_nodes_list_test(doi_input_list, search_depth_max, search_height_max, test_var): ''' + :param doi_input_list: input list of doi from UI + :type doi_input_list: List[String] + + :param search_depth_max: maximum depth to search for references + :type search_depth_max: int + + :param search_height_max: maximum height to search for citations + :type search_height_max: int + + :param test_var: variable to differentiate between test and url call + :type test_var: boolean + for unit test purposes only ''' global nodes, edges nodes = [] edges = [] + return (initialize_nodes_list(doi_input_list, search_depth_max, search_height_max, test_var)) @@ -52,6 +64,7 @@ def complete_inner_edges_test(test_nodes, test_edges): global nodes, edges nodes = 
test_nodes edges = test_edges + complete_inner_edges() return (nodes, edges) @@ -132,11 +145,15 @@ def complete_inner_edges(update_var=False, input_nodes=[], input_edges=[]): for node in nodes: if (node.group < 0): + + # searches nodes from 'other' group to find cross references for citation in node.citations: for pub in nodes: if ((pub.doi_url == citation.doi_url) and ([citation.doi_url, node.doi_url] not in edges)): edges.append([citation.doi_url, node.doi_url]) if (node.group > 0): + + # searches nodes from 'other' group to find cross references for reference in node.references: for pub in nodes: if ((pub.doi_url == reference.doi_url) and ([node.doi_url, reference.doi_url] not in edges)): diff --git a/verarbeitung/dev_files/README.md b/verarbeitung/dev_files/README.md deleted file mode 100644 index f42d3f9d67fb15ff3d219354b76967e8b3fe794e..0000000000000000000000000000000000000000 --- a/verarbeitung/dev_files/README.md +++ /dev/null @@ -1 +0,0 @@ -Dieser Ordner ist nur für uns intern, um Testläufe mit echten DOIs zu starten. 
\ No newline at end of file diff --git a/verarbeitung/dev_files/__init__.py b/verarbeitung/dev_files/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/verarbeitung/dev_files/print_graph_test.py b/verarbeitung/dev_files/print_graph_test.py deleted file mode 100644 index 9fa8441a38fdc65514fdd52835bc558450708b05..0000000000000000000000000000000000000000 --- a/verarbeitung/dev_files/print_graph_test.py +++ /dev/null @@ -1,115 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Functions to test and print the nodes and edges sets - -""" - -__authors__ = "Donna Löding, Alina Molkentin, Judith Große, Malte Schokolowski" -__email__ = "cis-project2021@zbh.uni-hamburg.de" -__status__ = "Production" - -# __copyright__ = "" -# __credits__ = ["", "", "", ""] -# __license__ = "" -# __version__ = "" -# __maintainer__ = "" - - -import sys - -# sys.path.insert(1, 'C:\Users\Malte\Git\CiS-Projekt\ci-s-projekt-verarbeitung\input') -sys.path.append("../../") -from verarbeitung.construct_new_graph.initialize_graph import init_graph_construction -from verarbeitung.update_graph.import_from_json import input_from_json -from verarbeitung.update_graph.update_graph import update_graph - - -# a function to print nodes and edges from a graph -def print_graph(nodes, edges): - print("Knoten:\n") - for node in nodes: - print(node.title, "\n") - print("\nKanten:\n") - for edge in edges: - print(edge, "\n") - print(len(nodes)) - print(len(edges)) - print(" ") - - -def print_extended_graph(nodes, edges): - print("Knoten:\n") - for node in nodes: - print(node.title, "\n") - print(node.doi_url) - for reference in node.references: - print(reference.doi_url) - for citation in node.citations: - print(citation.doi_url) - print("\nKanten:\n") - for edge in edges: - print(edge, "\n") - print(len(nodes)) - print(len(edges)) - print(" ") - - -def print_simple(nodes, edges): - # for node in nodes: - # print(node) - # for edge in edges: 
- # print(edge) - print(len(nodes)) - print(len(edges)) - print(" ") - - -# program test with some random DOIs -def try_known_publications(): - doi_list = [] - doi_list.append('https://pubs.acs.org/doi/10.1021/acs.jcim.9b00249') - # doi_list.append('https://doi.org/10.1021/acs.jcim.9b00249') - doi_list.append('https://pubs.acs.org/doi/10.1021/acs.jcim.1c00203') - # arr.append('https://pubs.acs.org/doi/10.1021/acs.jcim.9b00249') - doi_list.append('https://doi.org/10.1021/acs.jmedchem.0c01332') - # arr.append('https://doi.org/10.1021/acs.jcim.0c00741') - - # arr.append('https://doi.org/10.1021/ci700007b') - # doi_list.append('https://doi.org/10.1021/acs.jcim.5b00292') - - # doi_list.append('https://pubs.acs.org/doi/10.1021/acs.jcim.0c00675') - # url = sys.argv[1] - # arr.append[url] - - nodes, edges = init_graph_construction(doi_list, 2, 2) - - print_graph(nodes, edges) - - return (nodes, edges) - - -def try_delete_nodes(): - doi_list = [] - doi_list.append('https://pubs.acs.org/doi/10.1021/acs.jcim.9b00249') - # doi_list.append('https://pubs.acs.org/doi/10.1021/acs.jcim.1c00203') - nodes, edges = init_graph_construction(doi_list, 1, 1) - # print_simple(nodes, edges) - - # list_of_nodes_py, list_of_edges_py = input_from_json('json_text.json') - # doi_list = [] - # doi_list.append('https://pubs.acs.org/doi/10.1021/acs.jcim.9b00249') - # valid_nodes, valid_edges = update_graph(doi_list, list_of_nodes_py, list_of_edges_py) - # print_simple(valid_nodes, valid_edges) - - -def try_import(): - nodes, edges = input_from_json('json_text.json') - print_extended_graph(nodes, edges) - - -# nodes, edges = try_known_publications() -# nodes_new, edges_new = input_from_json("json_text.json") -# print_graph(nodes_new, edges_new) -try_delete_nodes() - -# try_import() diff --git a/verarbeitung/get_pub_from_input.py b/verarbeitung/get_pub_from_input.py index f27b72d8601387e1e72fd10ff7432a4f768f001f..c1fd8e7e02b901754fc4462a340f20d512b18f7a 100644 --- a/verarbeitung/get_pub_from_input.py 
+++ b/verarbeitung/get_pub_from_input.py @@ -6,7 +6,7 @@ A function to return an object of Type Publication for a given DOI __authors__ = "Donna Löding, Alina Molkentin, Judith Große, Malte Schokolowski" __email__ = "cis-project2021@zbh.uni-hamburg.de" -__status__ = "Production" +__status__ = "Finished" #__copyright__ = "" #__credits__ = ["", "", "", ""] #__license__ = "" diff --git a/verarbeitung/json_text.json b/verarbeitung/json_text.json deleted file mode 100644 index f77fc38c5d062f2c4eb808b2180e4256c493681b..0000000000000000000000000000000000000000 --- a/verarbeitung/json_text.json +++ /dev/null @@ -1 +0,0 @@ -{"nodes": [{"doi": "doi_lg_1_i", "name": "title_lg_1_i", "author": ["contributor_lg_1_i"], "year": "date_lg_1_i", "journal": "journal_lg_1_i", "abstract": null, "group": "Input", "depth": 0, "citations": 2}, {"doi": "doi_lg_1_d11", "name": "title_lg_1_d11", "author": ["contributor_lg_1_d11"], "year": "date_lg_1_d11", "journal": "journal_lg_1_d11", "abstract": null, "group": "Reference", "depth": -1, "citations": 1}, {"doi": "doi_lg_1_d12", "name": "title_lg_1_d12", "author": ["contributor_lg_1_d12"], "year": "date_lg_1_d12", "journal": "journal_lg_1_d12", "abstract": null, "group": "Reference", "depth": -1, "citations": 2}, {"doi": "doi_lg_1_h11", "name": "title_lg_1_h11", "author": ["contributor_lg_1_h11"], "year": "date_lg_1_h11", "journal": "journal_lg_1_h11", "abstract": null, "group": "Citedby", "depth": 1, "citations": 2}, {"doi": "doi_lg_1_h12", "name": "title_lg_1_h12", "author": ["contributor_lg_1_h12"], "year": "date_lg_1_h12", "journal": "journal_lg_1_h12", "abstract": null, "group": "Citedby", "depth": 1, "citations": 2}, {"doi": "doi_lg_1_h21", "name": "title_lg_1_h21", "author": ["contributor_lg_1_h21"], "year": "date_lg_1_h21", "journal": "journal_lg_1_h21", "abstract": null, "group": "Citedby", "depth": 2, "citations": 0}, {"doi": "doi_lg_1_h22", "name": "title_lg_1_h22", "author": ["contributor_lg_1_h22"], "year": "date_lg_1_h22", 
"journal": "journal_lg_1_h22", "abstract": null, "group": "Citedby", "depth": 2, "citations": 0}, {"doi": "doi_lg_1_h23", "name": "title_lg_1_h23", "author": ["contributor_lg_1_h23"], "year": "date_lg_1_h23", "journal": "journal_lg_1_h23", "abstract": null, "group": "Citedby", "depth": 2, "citations": 0}, {"doi": "doi_lg_1_d21", "name": "title_lg_1_d21", "author": ["contributor_lg_1_d21"], "year": "date_lg_1_d21", "journal": "journal_lg_1_d21", "abstract": null, "group": "Reference", "depth": -2, "citations": 2}, {"doi": "doi_lg_1_d22", "name": "title_lg_1_d22", "author": ["contributor_lg_1_d22"], "year": "date_lg_1_d22", "journal": "journal_lg_1_d22", "abstract": null, "group": "Reference", "depth": -2, "citations": 2}, {"doi": "doi_lg_1_d23", "name": "title_lg_1_d23", "author": ["contributor_lg_1_d23"], "year": "date_lg_1_d23", "journal": "journal_lg_1_d23", "abstract": null, "group": "Reference", "depth": -2, "citations": 2}], "links": [{"source": "doi_lg_1_i", "target": "doi_lg_1_d11"}, {"source": "doi_lg_1_i", "target": "doi_lg_1_d12"}, {"source": "doi_lg_1_h11", "target": "doi_lg_1_i"}, {"source": "doi_lg_1_h12", "target": "doi_lg_1_i"}, {"source": "doi_lg_1_h21", "target": "doi_lg_1_h11"}, {"source": "doi_lg_1_h22", "target": "doi_lg_1_h11"}, {"source": "doi_lg_1_h22", "target": "doi_lg_1_h12"}, {"source": "doi_lg_1_h23", "target": "doi_lg_1_h12"}, {"source": "doi_lg_1_d11", "target": "doi_lg_1_d21"}, {"source": "doi_lg_1_d11", "target": "doi_lg_1_d22"}, {"source": "doi_lg_1_d12", "target": "doi_lg_1_d23"}, {"source": "doi_lg_1_d21", "target": "doi_lg_1_d22"}, {"source": "doi_lg_1_d22", "target": "doi_lg_1_d21"}, {"source": "doi_lg_1_h12", "target": "doi_lg_1_d12"}], "depth_height": ["new_height.json", true]} \ No newline at end of file diff --git a/verarbeitung/new_height.json b/verarbeitung/new_height.json deleted file mode 100644 index 0cc59401969ce8a984fee556a67099167f2144a6..0000000000000000000000000000000000000000 --- a/verarbeitung/new_height.json +++ 
/dev/null @@ -1 +0,0 @@ -{"nodes": [{"doi": "doi_lg_1_i", "name": "title_lg_1_i", "author": ["contributor_lg_1_i"], "year": "date_lg_1_i", "journal": "journal_lg_1_i", "abstract": null, "group": "Input", "depth": 0, "citations": 2}, {"doi": "doi_lg_1_d11", "name": "title_lg_1_d11", "author": ["contributor_lg_1_d11"], "year": "date_lg_1_d11", "journal": "journal_lg_1_d11", "abstract": null, "group": "Reference", "depth": -1, "citations": 1}, {"doi": "doi_lg_1_d12", "name": "title_lg_1_d12", "author": ["contributor_lg_1_d12"], "year": "date_lg_1_d12", "journal": "journal_lg_1_d12", "abstract": null, "group": "Reference", "depth": -1, "citations": 2}, {"doi": "doi_lg_1_h11", "name": "title_lg_1_h11", "author": ["contributor_lg_1_h11"], "year": "date_lg_1_h11", "journal": "journal_lg_1_h11", "abstract": null, "group": "Citedby", "depth": 1, "citations": 2}, {"doi": "doi_lg_1_h12", "name": "title_lg_1_h12", "author": ["contributor_lg_1_h12"], "year": "date_lg_1_h12", "journal": "journal_lg_1_h12", "abstract": null, "group": "Citedby", "depth": 1, "citations": 2}, {"doi": "doi_lg_1_h21", "name": "title_lg_1_h21", "author": ["contributor_lg_1_h21"], "year": "date_lg_1_h21", "journal": "journal_lg_1_h21", "abstract": null, "group": "Citedby", "depth": 2, "citations": 0}, {"doi": "doi_lg_1_h22", "name": "title_lg_1_h22", "author": ["contributor_lg_1_h22"], "year": "date_lg_1_h22", "journal": "journal_lg_1_h22", "abstract": null, "group": "Citedby", "depth": 2, "citations": 0}, {"doi": "doi_lg_1_h23", "name": "title_lg_1_h23", "author": ["contributor_lg_1_h23"], "year": "date_lg_1_h23", "journal": "journal_lg_1_h23", "abstract": null, "group": "Citedby", "depth": 2, "citations": 0}, {"doi": "doi_lg_1_d21", "name": "title_lg_1_d21", "author": ["contributor_lg_1_d21"], "year": "date_lg_1_d21", "journal": "journal_lg_1_d21", "abstract": null, "group": "Reference", "depth": -2, "citations": 2}, {"doi": "doi_lg_1_d22", "name": "title_lg_1_d22", "author": ["contributor_lg_1_d22"], 
"year": "date_lg_1_d22", "journal": "journal_lg_1_d22", "abstract": null, "group": "Reference", "depth": -2, "citations": 2}, {"doi": "doi_lg_1_d23", "name": "title_lg_1_d23", "author": ["contributor_lg_1_d23"], "year": "date_lg_1_d23", "journal": "journal_lg_1_d23", "abstract": null, "group": "Reference", "depth": -2, "citations": 2}], "links": [{"source": "doi_lg_1_i", "target": "doi_lg_1_d11"}, {"source": "doi_lg_1_i", "target": "doi_lg_1_d12"}, {"source": "doi_lg_1_h11", "target": "doi_lg_1_i"}, {"source": "doi_lg_1_h12", "target": "doi_lg_1_i"}, {"source": "doi_lg_1_h21", "target": "doi_lg_1_h11"}, {"source": "doi_lg_1_h22", "target": "doi_lg_1_h11"}, {"source": "doi_lg_1_h22", "target": "doi_lg_1_h12"}, {"source": "doi_lg_1_h23", "target": "doi_lg_1_h12"}, {"source": "doi_lg_1_d11", "target": "doi_lg_1_d21"}, {"source": "doi_lg_1_d11", "target": "doi_lg_1_d22"}, {"source": "doi_lg_1_d12", "target": "doi_lg_1_d23"}, {"source": "doi_lg_1_d21", "target": "doi_lg_1_d22"}, {"source": "doi_lg_1_d22", "target": "doi_lg_1_d21"}, {"source": "doi_lg_1_h12", "target": "doi_lg_1_d12"}], "depth_height": [2, 2]} \ No newline at end of file diff --git "a/verarbeitung/n\303\266tige Tests.txt" "b/verarbeitung/n\303\266tige Tests.txt" deleted file mode 100644 index 95563280436fbf6b9b8702dffef6f32e213f5a16..0000000000000000000000000000000000000000 --- "a/verarbeitung/n\303\266tige Tests.txt" +++ /dev/null @@ -1,4 +0,0 @@ -Zyklus -großer Zyklus -Innere Kanten vervollständigen - diff --git a/verarbeitung/process_main.py b/verarbeitung/process_main.py index 2567c24c9fea70dd4c62abc40dc26dd677bf99e1..05e580e3c1c91154fb1443a634814609c071bbc4 100644 --- a/verarbeitung/process_main.py +++ b/verarbeitung/process_main.py @@ -6,7 +6,7 @@ main function to call to generate a graph representing citations between multipl __authors__ = "Donna Löding, Alina Molkentin, Judith Große, Malte Schokolowski" __email__ = "cis-project2021@zbh.uni-hamburg.de" -__status__ = "Production" +__status__ = 
"Finished" #__copyright__ = "" #__credits__ = ["", "", "", ""] #__license__ = "" @@ -39,7 +39,7 @@ def Processing(url_list, search_depth, search_height, json_file = 'json_text.jso :param json_file: file to export graph to :type json_file: String - main function to construct new or updated publication graphs + main function to construct new or update known publication graph ''' # updates graph if json file is known in directory otherwise starts new graph construction diff --git a/verarbeitung/start_script.py b/verarbeitung/start_script.py index 92295eefbd363053762a3d6df99d028af44151b1..0e9ae8366c65b8b81032d114eb1f76db698eeaee 100644 --- a/verarbeitung/start_script.py +++ b/verarbeitung/start_script.py @@ -1,8 +1,13 @@ +""" + This file is for testing purposes only. We left it in the directory for ease of use. + To use it you need to shift it into the main directory of the project +""" + import sys import gc from pathlib import Path from verarbeitung.process_main import Processing -#from verarbeitung.dev_files.print_graph_test import try_known_publications, try_delete_nodes + doi_list = [] diff --git a/verarbeitung/test/construct_graph_unittest.py b/verarbeitung/test/construct_graph_unittest.py index 1cef0dd92393ecb062d6543f0c50ef32090710f8..5e0632a05ad37af7ac191253675d87058708aa9e 100644 --- a/verarbeitung/test/construct_graph_unittest.py +++ b/verarbeitung/test/construct_graph_unittest.py @@ -1,5 +1,21 @@ -import unittest +# -*- coding: utf-8 -*- +""" +Functions to unittest functions which construct a new graph + +""" + +__authors__ = "Donna Löding, Alina Molkentin, Judith Große, Malte Schokolowski" +__email__ = "cis-project2021@zbh.uni-hamburg.de" +__status__ = "Finished" +# __copyright__ = "" +# __credits__ = ["", "", "", ""] +# __license__ = "" +# __version__ = "" +# __maintainer__ = "" + + +import unittest import sys sys.path.append("../") @@ -98,7 +114,7 @@ class ConstructionTest(unittest.TestCase): self.assertCountEqual(edges, [['doi1', 'doi2'], ['doi3', 
'doi1']]) self.assertCountEqual(err_list, ['doi2ic']) - ## From here the tests for the individual functions ## + ## From here: tests for the individual functions ## # initialize_graph.py: diff --git a/verarbeitung/test/input_test.py b/verarbeitung/test/input_test.py index 7164e234dd55b65bd5804744d6535a50b7b9cb68..f9839a2e3b8b7fd4a87aaefd17018d5c9b863cb0 100644 --- a/verarbeitung/test/input_test.py +++ b/verarbeitung/test/input_test.py @@ -1,3 +1,21 @@ +# -*- coding: utf-8 -*- +""" +Implements a test input function analogous to the one from the input group. +Also implements example graphs + +""" + +__authors__ = "Donna Löding, Alina Molkentin, Judith Große, Malte Schokolowski" +__email__ = "cis-project2021@zbh.uni-hamburg.de" +__status__ = "Finished" + +# __copyright__ = "" +# __credits__ = ["", "", "", ""] +# __license__ = "" +# __version__ = "" +# __maintainer__ = "" + + import sys sys.path.append("../") @@ -23,8 +41,11 @@ def input_test_func(pub_doi): def cit(list_doi, cit_type): ''' - :param list_doi list of citation DOIs to get their Citation Class - :type list_doi: List[String] + :param list_doi: list of citation DOIs to get their Citation Class + :type list_doi: List[String] + + :param cit_type: variable to differentiate citation and reference call + :type cit_type: String returns a list of citations objects for given DOI list ''' @@ -37,6 +58,8 @@ def cit(list_doi, cit_type): return cits +# large_graph_1, large_graph_2 and crossed_graph are visualized in test_graphs_plan.pdf + beispiel1 = ['doi1', 'title1', ['contributor1'], 'journal1', 'date1', ['subject1'], ['doi2'], ['doi3']] beispiel2 = ['doi2', 'title2', ['contributor2'], 'journal2', 'date2', ['subject2'], [], ['doi1']] beispiel3 = ['doi3', 'title3', ['contributor3'], 'journal3', 'date3', ['subject3'], ['doi1'], []] diff --git a/verarbeitung/test/update_graph_unittest.py b/verarbeitung/test/update_graph_unittest.py index 
1ebb0d10d885b4f8aea45a9b59a487ed25988197..ae0e3a4dec85ae6d3f10d25206d62bb64c2049fb 100644 --- a/verarbeitung/test/update_graph_unittest.py +++ b/verarbeitung/test/update_graph_unittest.py @@ -1,5 +1,21 @@ -import unittest +# -*- coding: utf-8 -*- +""" +Functions to unittest functions which are updating a known graph + +""" + +__authors__ = "Donna Löding, Alina Molkentin, Judith Große, Malte Schokolowski" +__email__ = "cis-project2021@zbh.uni-hamburg.de" +__status__ = "Finished" +# __copyright__ = "" +# __credits__ = ["", "", "", ""] +# __license__ = "" +# __version__ = "" +# __maintainer__ = "" + + +import unittest import sys from pathlib import Path @@ -79,7 +95,7 @@ class UpdatingTest(unittest.TestCase): self.assertCountEqual(new_nodes, nodes) self.assertCountEqual(new_edges, edges) - ## From here the tests for the individual functions ## + ## From here: tests for the individual functions ## # update_graph.py: diff --git a/verarbeitung/test_output.json b/verarbeitung/test_output.json deleted file mode 100644 index 0cc59401969ce8a984fee556a67099167f2144a6..0000000000000000000000000000000000000000 --- a/verarbeitung/test_output.json +++ /dev/null @@ -1 +0,0 @@ -{"nodes": [{"doi": "doi_lg_1_i", "name": "title_lg_1_i", "author": ["contributor_lg_1_i"], "year": "date_lg_1_i", "journal": "journal_lg_1_i", "abstract": null, "group": "Input", "depth": 0, "citations": 2}, {"doi": "doi_lg_1_d11", "name": "title_lg_1_d11", "author": ["contributor_lg_1_d11"], "year": "date_lg_1_d11", "journal": "journal_lg_1_d11", "abstract": null, "group": "Reference", "depth": -1, "citations": 1}, {"doi": "doi_lg_1_d12", "name": "title_lg_1_d12", "author": ["contributor_lg_1_d12"], "year": "date_lg_1_d12", "journal": "journal_lg_1_d12", "abstract": null, "group": "Reference", "depth": -1, "citations": 2}, {"doi": "doi_lg_1_h11", "name": "title_lg_1_h11", "author": ["contributor_lg_1_h11"], "year": "date_lg_1_h11", "journal": "journal_lg_1_h11", "abstract": null, "group": "Citedby", 
"depth": 1, "citations": 2}, {"doi": "doi_lg_1_h12", "name": "title_lg_1_h12", "author": ["contributor_lg_1_h12"], "year": "date_lg_1_h12", "journal": "journal_lg_1_h12", "abstract": null, "group": "Citedby", "depth": 1, "citations": 2}, {"doi": "doi_lg_1_h21", "name": "title_lg_1_h21", "author": ["contributor_lg_1_h21"], "year": "date_lg_1_h21", "journal": "journal_lg_1_h21", "abstract": null, "group": "Citedby", "depth": 2, "citations": 0}, {"doi": "doi_lg_1_h22", "name": "title_lg_1_h22", "author": ["contributor_lg_1_h22"], "year": "date_lg_1_h22", "journal": "journal_lg_1_h22", "abstract": null, "group": "Citedby", "depth": 2, "citations": 0}, {"doi": "doi_lg_1_h23", "name": "title_lg_1_h23", "author": ["contributor_lg_1_h23"], "year": "date_lg_1_h23", "journal": "journal_lg_1_h23", "abstract": null, "group": "Citedby", "depth": 2, "citations": 0}, {"doi": "doi_lg_1_d21", "name": "title_lg_1_d21", "author": ["contributor_lg_1_d21"], "year": "date_lg_1_d21", "journal": "journal_lg_1_d21", "abstract": null, "group": "Reference", "depth": -2, "citations": 2}, {"doi": "doi_lg_1_d22", "name": "title_lg_1_d22", "author": ["contributor_lg_1_d22"], "year": "date_lg_1_d22", "journal": "journal_lg_1_d22", "abstract": null, "group": "Reference", "depth": -2, "citations": 2}, {"doi": "doi_lg_1_d23", "name": "title_lg_1_d23", "author": ["contributor_lg_1_d23"], "year": "date_lg_1_d23", "journal": "journal_lg_1_d23", "abstract": null, "group": "Reference", "depth": -2, "citations": 2}], "links": [{"source": "doi_lg_1_i", "target": "doi_lg_1_d11"}, {"source": "doi_lg_1_i", "target": "doi_lg_1_d12"}, {"source": "doi_lg_1_h11", "target": "doi_lg_1_i"}, {"source": "doi_lg_1_h12", "target": "doi_lg_1_i"}, {"source": "doi_lg_1_h21", "target": "doi_lg_1_h11"}, {"source": "doi_lg_1_h22", "target": "doi_lg_1_h11"}, {"source": "doi_lg_1_h22", "target": "doi_lg_1_h12"}, {"source": "doi_lg_1_h23", "target": "doi_lg_1_h12"}, {"source": "doi_lg_1_d11", "target": "doi_lg_1_d21"}, 
{"source": "doi_lg_1_d11", "target": "doi_lg_1_d22"}, {"source": "doi_lg_1_d12", "target": "doi_lg_1_d23"}, {"source": "doi_lg_1_d21", "target": "doi_lg_1_d22"}, {"source": "doi_lg_1_d22", "target": "doi_lg_1_d21"}, {"source": "doi_lg_1_h12", "target": "doi_lg_1_d12"}], "depth_height": [2, 2]} \ No newline at end of file diff --git a/verarbeitung/update_graph/compare_old_and_new_node_lists.py b/verarbeitung/update_graph/compare_old_and_new_node_lists.py index a899a8a7fd6b405af423780be599e6affde76a67..2b1102b6cdead683c9e03d85f6fb1cb0fadb5f80 100644 --- a/verarbeitung/update_graph/compare_old_and_new_node_lists.py +++ b/verarbeitung/update_graph/compare_old_and_new_node_lists.py @@ -1,4 +1,20 @@ -#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" + compares old and new input list to find common, deleted and inserted input dois. + +""" + +__authors__ = "Donna Löding, Alina Molkentin, Judith Große, Malte Schokolowski" +__email__ = "cis-project2021@zbh.uni-hamburg.de" +__status__ = "Finished" + +# __copyright__ = "" +# __credits__ = ["", "", "", ""] +# __license__ = "" +# __version__ = "" +# __maintainer__ = "" + + from collections import Counter @@ -29,11 +45,5 @@ def compare_old_and_new_node_lists(old_doi_node_list, new_doi_node_list): elif ((doi in dois_from_new_graph) & ( doi not in dois_from_old_graph)): # if the DOI occurs ince and it is from new graph it is a inserted node inserted_nodes.append(doi) # appends the DOI to the inserted ones + return (common_nodes, inserted_nodes, deleted_nodes) - -# Test Prints -# liste_1 = ["doi_1","doi_2","doi_3","doi_4","doi_5"] -# liste_2 = ["doi_1","doi_2","doi_3","doi_6","doi_7"] -# print("gemeinsame Elemente: ",doi_listen_vergleichen(liste_1,liste_2)[0]) -# print("hinzugefügte Elemente: ",doi_listen_vergleichen(liste_1,liste_2)[1]) -# print("gelöschte Elemente: ",doi_listen_vergleichen(liste_1,liste_2)[2]) diff --git a/verarbeitung/update_graph/connect_new_input.py b/verarbeitung/update_graph/connect_new_input.py index 
6beb4a88e1a5424011d45ac1479c63268445307b..45cb55c1cf200b8902d776ab5b721d57a2b22602 100644 --- a/verarbeitung/update_graph/connect_new_input.py +++ b/verarbeitung/update_graph/connect_new_input.py @@ -6,7 +6,7 @@ Functions to update a graph representing citations between multiple ACS/Nature j __authors__ = "Donna Löding, Alina Molkentin, Judith Große, Malte Schokolowski" __email__ = "cis-project2021@zbh.uni-hamburg.de" -__status__ = "Production" +__status__ = "Finished" # __copyright__ = "" # __credits__ = ["", "", "", ""] @@ -23,10 +23,34 @@ sys.path.append("../") from input.publication import Publication from verarbeitung.get_pub_from_input import get_pub from verarbeitung.construct_new_graph.initialize_graph import init_graph_construction -from verarbeitung.construct_new_graph.add_citations_rec import add_citations, get_cit_type_list, create_global_lists_cit +from verarbeitung.construct_new_graph.add_citations_rec import add_citations, get_cit_type_list def find_furthermost_citations_test(test_nodes, test_edges, changed_node, old_search_depth, new_search_depth, cit_type): + """ + :param test_nodes: list of publications from unit test + :type test_nodes: List[Publication] + + :param test_edges: list of links from unit test + :type test_edges: List[List[String,String]] + + :param new_nodes: list of nodes which are generated separately from main node list to avoid recursive problems + :type new_nodes: List[Publication] + + :param new_edges: list of edges which are generated separately from main edge list to avoid recursive problems + :type new_edges: List[List[String,String]] + + :param node: node which is known but not from input group + :type node: Publication + + :param old_search_depth: depth to search for references from old construction call + :type old_search_depth: int + + :param cit_type: determines whether the function call is for a reference or citation + :type cit_type: String + + for unit test purposes only + """ global nodes, edges nodes = test_nodes edges = 
edges = test_edges @@ -36,6 +60,30 @@ def find_furthermost_citations_test(test_nodes, test_edges, changed_node, old_se def complete_changed_group_nodes_test(test_nodes, test_edges, inserted_test_nodes, old_search_depth, old_search_height, new_search_depth, new_search_height): + """ + :param test_nodes: list of publications from unit test + :type test_nodes: List[Publication] + + :param test_edges: list of links from unit test + :type test_edges: List[List[String,String]] + + :param inserted_test_nodes: list of nodes which are inserted to new input array + :type inserted_test_nodes: List[String] + + :param old_search_depth: depth to search for references from old construction call + :type old_search_depth: int + + :param old_search_height: height to search for citations from old construction call + :type old_search_height: int + + :param new_search_depth: depth to search for references from new construction call + :type new_search_depth: int + + :param new_search_height: height to search for citations from new construction call + :type new_search_height: int + + for unit test purposes only + """ global nodes, edges nodes = test_nodes edges = test_edges diff --git a/verarbeitung/update_graph/delete_nodes_edges.py b/verarbeitung/update_graph/delete_nodes_edges.py index b7af110fb0f554cf29471c76a029278c86116708..6836ccfa316748ff4a19fc2c954f299c060c7570 100644 --- a/verarbeitung/update_graph/delete_nodes_edges.py +++ b/verarbeitung/update_graph/delete_nodes_edges.py @@ -1,12 +1,12 @@ # -*- coding: utf-8 -*- """ -Functions to remove publications/links from nodes/edges list, if they can no longer be reached + Functions to remove publications/links from nodes/edges list, if they can no longer be reached """ __authors__ = "Donna Löding, Alina Molkentin, Judith Große, Malte Schokolowski" __email__ = "cis-project2021@zbh.uni-hamburg.de" -__status__ = "Production" +__status__ = "Finished" #__copyright__ = "" #__credits__ = ["", "", "", ""] #__license__ = "" @@ -20,15 +20,31 @@
sys.path.append("../../") from .update_edges import back_to_valid_edges -def search_ref_cit_graph_rec_test(pubs, new_test_input, old_max_depth, cit_var): +def search_ref_cit_graph_rec_test(pubs, new_test_input, old_max_depth, cit_type): + ''' + :param pubs: publications stored as input_obj_list for the search + :type pubs: List[Publication] + + :param new_test_input: publications from which the recursive search is started + :type new_test_input: List[Publication] + + :param old_max_depth: old max search depth + :type old_max_depth: int + + :param cit_type: variable to differentiate citation and reference call + :type cit_type: String + + for unit test purposes only + ''' + global usable_nodes, input_obj_list usable_nodes = [] input_obj_list = pubs - if cit_var == "Reference": + if cit_type == "Reference": for pub in new_test_input: search_ref_graph_rec(pub, 1, old_max_depth) - elif cit_var == "Citation": + elif cit_type == "Citation": for pub in new_test_input: search_cit_graph_rec(pub, 1, old_max_depth) return usable_nodes diff --git a/verarbeitung/update_graph/import_from_json.py b/verarbeitung/update_graph/import_from_json.py index 36b4778c5a8a27234ed341697657d05af1662c3c..62c464481a9561c79d6e10cbd9eceb2387082ec7 100644 --- a/verarbeitung/update_graph/import_from_json.py +++ b/verarbeitung/update_graph/import_from_json.py @@ -1,12 +1,12 @@ # -*- coding: utf-8 -*- """ -Functions to read old json files to recreate old graph structure + Functions to read old json files to recreate old graph structure """ __authors__ = "Donna Löding, Alina Molkentin, Judith Große, Malte Schokolowski" __email__ = "cis-project2021@zbh.uni-hamburg.de" -__status__ = "Production" +__status__ = "Finished" #__copyright__ = "" #__credits__ = ["", "", "", ""] #__license__ = "" @@ -24,7 +24,7 @@ from input.publication import Publication, Citation def create_pubs_from_json(input_dict): ''' :param input_dict: dictionary read from old graph json file - :type json_file: dictionary + :type input_dict: Dict creates list of publication retrieved from old json file '''
@@ -42,7 +42,7 @@ def create_pubs_from_json(input_dict): def add_ref_and_cit_to_pubs(input_dict): ''' :param input_dict: dictionary read from old graph json file - :type json_file: dictionary + :type input_dict: Dict adds references and citations to retrieved publication list ''' diff --git a/verarbeitung/update_graph/update_depth.py b/verarbeitung/update_graph/update_depth.py index 27607fcb5deb340cc5fd1fdd59034de8887b3eeb..640f919c0ad5ae26caead5b0d83e079e9afcc569 100644 --- a/verarbeitung/update_graph/update_depth.py +++ b/verarbeitung/update_graph/update_depth.py @@ -6,7 +6,7 @@ Functions to update the citation depth of recursive graph construction __authors__ = "Donna Löding, Alina Molkentin, Judith Große, Malte Schokolowski" __email__ = "cis-project2021@zbh.uni-hamburg.de" -__status__ = "Production" +__status__ = "Finished" #__copyright__ = "" #__credits__ = ["", "", "", ""] #__license__ = "" @@ -17,7 +17,6 @@ import sys sys.path.append("../../") from verarbeitung.construct_new_graph.add_citations_rec import add_citations -from verarbeitung.construct_new_graph.initialize_graph import complete_inner_edges from verarbeitung.get_pub_from_input import get_pub from .update_edges import back_to_valid_edges from input.publication import Publication @@ -101,6 +100,9 @@ def get_old_max_references(old_depth, test_var): :param old_depth: old maximum depth to search for citations :type old_depth: int + :param test_var: variable to differentiate between test and url call + :type test_var: boolean + function to get references for new recursive levels ''' old_max_references = [] @@ -119,6 +121,9 @@ def get_old_max_citations(old_height, test_var): :param old_height: old maximum height to search for citations :type old_height: int + :param test_var: variable to differentiate between test and url call + :type test_var: boolean + function to get citations for new recursive levels ''' old_max_citations = [] diff --git a/verarbeitung/update_graph/update_edges.py 
b/verarbeitung/update_graph/update_edges.py index 45a7bded40d8e6328e0addf524d05d9de66d2477..bb174da3a5c1f0a1fdfda0313f2e0fe1e592c6e5 100644 --- a/verarbeitung/update_graph/update_edges.py +++ b/verarbeitung/update_graph/update_edges.py @@ -1,4 +1,18 @@ -#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" + returns the edges list to a valid state after node deletion + +""" + +__authors__ = "Donna Löding, Alina Molkentin, Judith Große, Malte Schokolowski" +__email__ = "cis-project2021@zbh.uni-hamburg.de" +__status__ = "Finished" + +# __copyright__ = "" +# __credits__ = ["", "", "", ""] +# __license__ = "" +# __version__ = "" +# __maintainer__ = "" def back_to_valid_edges(links_from_json, processed_input_list): ''' diff --git a/verarbeitung/update_graph/update_graph.py b/verarbeitung/update_graph/update_graph.py index 2bb93c3f851862398f9287173987deb6e2279e21..a92ff7dfa0e1363a28c53ed16a5e26d990c297c6 100644 --- a/verarbeitung/update_graph/update_graph.py +++ b/verarbeitung/update_graph/update_graph.py @@ -6,7 +6,8 @@ Functions to update a graph representing citations between multiple ACS/Nature j __authors__ = "Donna Löding, Alina Molkentin, Judith Große, Malte Schokolowski" __email__ = "cis-project2021@zbh.uni-hamburg.de" -__status__ = "Production" +__status__ = "Finished" + #__copyright__ = "" #__credits__ = ["", "", "", ""] #__license__ = "" @@ -16,7 +17,6 @@ __status__ = "Production" import sys - sys.path.append("../../") from input.publication import Publication @@ -59,6 +59,7 @@ def get_new_input_dois(new_input, test_var): # new list to save doi_url for each new input url new_input_dois = [] for new_node in new_input: + # retrieves information and adds to new list if successful pub = get_pub(new_node, test_var) if (type(pub) != Publication): @@ -111,12 +112,14 @@ def update_graph(new_doi_input_list, json_file, search_depth, search_height, tes if (len(deleted_nodes) > 0): processed_list, valid_edges = delete_nodes_and_edges(processed_list, common_nodes, 
valid_edges, old_search_depth, old_search_height) + # returns new lists for nodes and edges if new input dois exist if (len(inserted_nodes) > 0): inserted_pub_nodes, inserted_edges, error_doi_list_new = connect_old_and_new_input(processed_list_copy, valid_edges_copy, inserted_nodes, old_search_depth, old_search_height, search_depth, search_height, test_var) for err_node in error_doi_list_new: if err_node not in error_doi_list: error_doi_list.append(err_node) + # compares new list to processed list and adds new nodes/replaces known nodes for inserted_node in inserted_pub_nodes: not_in_nodes = True for node in processed_list: @@ -128,10 +131,12 @@ def update_graph(new_doi_input_list, json_file, search_depth, search_height, tes if not_in_nodes: processed_list.append(inserted_node) + # adds new edges to list valid_edges for inserted_edge in inserted_edges: if inserted_edge not in valid_edges: valid_edges.append(inserted_edge) + # calls function to find cross references between citation and reference group complete_inner_edges(True, processed_list, valid_edges) return(processed_list, valid_edges, error_doi_list)