From 4588d4d0cabcfb4def262dcdc78bd3eb6bab402a Mon Sep 17 00:00:00 2001 From: Malte Schokolowski <baw8441@uni-hamburg.de> Date: Wed, 19 Jan 2022 14:22:27 +0100 Subject: [PATCH] verarbeitung: bug fix in update_graph and connect_new_input --- .../construct_new_graph/add_citations_rec.py | 2 +- verarbeitung/start_script.py | 7 ++- .../update_graph/connect_new_input.py | 57 ++++++++++--------- verarbeitung/update_graph/update_graph.py | 5 +- 4 files changed, 41 insertions(+), 30 deletions(-) diff --git a/verarbeitung/construct_new_graph/add_citations_rec.py b/verarbeitung/construct_new_graph/add_citations_rec.py index c5f4dd0..dee665c 100644 --- a/verarbeitung/construct_new_graph/add_citations_rec.py +++ b/verarbeitung/construct_new_graph/add_citations_rec.py @@ -46,7 +46,7 @@ def get_cit_type_list(pub, cit_type): :param cit_type: variable to differenciate citation and reference call :type cit_type: String - function to create nodes and edges and call create_graph_structure_citations + function to return citation or reference list for given pub ''' if cit_type == "Citation": return(pub.citations) diff --git a/verarbeitung/start_script.py b/verarbeitung/start_script.py index 4cc48e4..43e9bcc 100644 --- a/verarbeitung/start_script.py +++ b/verarbeitung/start_script.py @@ -6,13 +6,16 @@ from verarbeitung.process_main import Processing doi_list = [] -doi_list.append('https://pubs.acs.org/doi/10.1021/acs.jcim.9b00249') +#doi_list.append('https://pubs.acs.org/doi/10.1021/acs.jcim.9b00249') #doi_list.append('https://doi.org/10.1021/acs.jcim.9b00249') #doi_list.append('https://pubs.acs.org/doi/10.1021/acs.jcim.1c00203') #doi_list.append('https://doi.org/10.1021/acs.jmedchem.0c01332') #doi_list.append('https://pubs.acs.org/doi/10.1021/acs.jcim.6b00709') #doi_list.append('https://doi.org/10.1021/acs.chemrev.8b00728') -error_list = Processing(doi_list, 2, 2, 'test.json') +#doi_list.append('https://pubs.acs.org/doi/10.1021/acs.chemrestox.0c00006')# +doi_list.append('https://doi.org/10.1021/acs.chemrev.8b00728') +doi_list.append('https://doi.org/10.1021/acs.jpclett.1c03335 ') +error_list = Processing(doi_list, 2, 2, 'test728.json') print(error_list) del doi_list diff --git a/verarbeitung/update_graph/connect_new_input.py b/verarbeitung/update_graph/connect_new_input.py index af7363a..2689efd 100644 --- a/verarbeitung/update_graph/connect_new_input.py +++ b/verarbeitung/update_graph/connect_new_input.py @@ -19,16 +19,19 @@ from os import error sys.path.append("../") +from input.publication import Publication +from verarbeitung.get_pub_from_input import get_pub from verarbeitung.construct_new_graph.initialize_graph import init_graph_construction from verarbeitung.construct_new_graph.add_citations_rec import add_citations, get_cit_type_list, create_global_lists_cit -def find_furthermost_citations_test(test_nodes, test_edges, changed_node, old_search_depth, cit_type): + +def find_furthermost_citations_test(test_nodes, test_edges, changed_node, old_search_depth, new_search_depth, cit_type): global nodes, edges nodes = test_nodes edges = test_edges - return(find_furthermost_citations(nodes, edges, changed_node, old_search_depth, cit_type)) + return(find_furthermost_citations(nodes, edges, changed_node, old_search_depth, new_search_depth, cit_type)) def complete_changed_group_nodes_test(test_nodes, test_edges, inserted_test_nodes, old_search_depth, old_search_height, new_search_depth, new_search_height): global nodes, edges @@ -40,7 +43,7 @@ def complete_changed_group_nodes_test(test_nodes, test_edges, inserted_test_node -def find_furthermost_citations(new_nodes, new_edges, node, old_search_depth, cit_type): +def find_furthermost_citations(new_nodes, new_edges, node, old_search_depth, new_search_depth, cit_type): ''' :param new_nodes: list of nodes which are generated seperately from main node list to avoid recursive problems :type new_nodes List[Publication] @@ -64,7 +67,7 @@ def find_furthermost_citations(new_nodes, new_edges, node, old_search_depth, cit citations_saved = [node] # group of node and old search depth/height determines how often the loop needs to be repeated - for depth in range(old_search_depth - abs(node.group)): + for depth in range(min(old_search_depth - abs(node.group), new_search_depth)): new_citations = [] for citation in citations_saved: for cit_node in nodes: @@ -147,36 +150,38 @@ def complete_changed_group_nodes(inserted_nodes, old_search_depth, old_search_he # moves known reference node to input and completes citations and references for this node if (node.group < 0) and (node.doi_url in inserted_nodes): - node.group = 0 - new_max_citations = find_furthermost_citations(new_nodes, new_edges, node, old_search_height + abs(node.group), "Citation") - add_citations(new_nodes, new_edges, new_max_citations, old_search_height, new_search_height, "Citation", test_var) - - new_nodes, new_edges, error_doi_list_ref = init_graph_construction([node.doi_url], new_search_height, 0, test_var, True, new_nodes, new_edges) - - for err_node in error_doi_list_ref: - if err_node not in error_doi_list: - error_doi_list.append(err_node) + pub = get_pub(node.doi_url, test_var) + if (type(pub) != Publication): - + error_doi_list.append(node.doi_url) + continue + + pub.group = node.group old_max_references = find_furthermost_citations(new_nodes, new_edges, node, old_search_depth, "Reference") - add_citations(new_nodes, new_edges, old_max_references, old_search_depth, new_search_depth, "Reference", test_var) + add_citations(new_nodes, new_edges, old_max_references, min(old_search_depth - abs(node.group), new_search_depth), new_search_depth, "Reference", test_var) + + add_citations(new_nodes, new_edges, [pub], 0, new_search_height, "Citation", test_var) + + pub.group = 0 + new_nodes.append(pub) handled_inserted_nodes.append(node) # moves known citation node to input and completes citations and references for this node elif (node.group > 0) and (node.doi_url in inserted_nodes): - node.group = 0 - new_max_references = find_furthermost_citations(new_nodes, new_edges, node, old_search_depth + abs(node.group), "Reference") - add_citations(new_nodes, new_edges, new_max_references, old_search_depth, new_search_depth, "Reference", test_var) - #new_nodes.append(new_max_references) - - new_nodes, new_edges, error_doi_list_ref = init_graph_construction([node.doi_url], new_search_depth, 0, test_var, True, new_nodes, new_edges) - for err_node in error_doi_list_ref: - if err_node not in error_doi_list: - error_doi_list.append(err_node) + pub = get_pub(node.doi_url, test_var) + if (type(pub) != Publication): + + error_doi_list.append(node.doi_url) + continue + + pub.group = node.group + old_max_citations = find_furthermost_citations(new_nodes, new_edges, pub, old_search_height, new_search_height, "Citation") + add_citations(new_nodes, new_edges, old_max_citations, min(old_search_height - abs(node.group), new_search_height), new_search_height, "Citation", test_var) + add_citations(new_nodes, new_edges, [pub], 0, new_search_depth, "Reference", test_var) - old_max_citations = find_furthermost_citations(new_nodes, new_edges, node, old_search_height, "Citation") - add_citations(new_nodes, new_edges, old_max_citations, old_search_height, new_search_height, "Citation", test_var) + pub.group = 0 + new_nodes.append(pub) handled_inserted_nodes.append(node) for new_node in new_nodes: diff --git a/verarbeitung/update_graph/update_graph.py b/verarbeitung/update_graph/update_graph.py index 416be51..1e74773 100644 --- a/verarbeitung/update_graph/update_graph.py +++ b/verarbeitung/update_graph/update_graph.py @@ -101,15 +101,18 @@ def update_graph(new_doi_input_list, json_file, search_depth, search_height, tes # retrieve which publications are already known, removed, inserted common_nodes, inserted_nodes, deleted_nodes = compare_old_and_new_node_lists(old_doi_input_list, new_doi_input_list) - old_search_depth, old_search_height = update_depth(processed_list, valid_edges, search_depth, search_height, test_var) + processed_list_copy = processed_list.copy() valid_edges_copy = valid_edges.copy() + old_search_depth, old_search_height = update_depth(processed_list, valid_edges, search_depth, search_height, test_var) + # deletes publications and edges from node_list if publications can no longer be reached if (len(deleted_nodes) > 0): processed_list, valid_edges = delete_nodes_and_edges(processed_list, common_nodes, valid_edges) + if (len(inserted_nodes) > 0): inserted_pub_nodes, inserted_edges, error_doi_list_new = connect_old_and_new_input(processed_list_copy, valid_edges_copy, inserted_nodes, old_search_depth, old_search_height, search_depth, search_height, test_var) for err_node in error_doi_list_new: -- GitLab