diff --git a/verarbeitung/Processing.py b/verarbeitung/Processing.py
index ab7db1d5c46193c14637f10e28fd163dee81b10e..0dcc7391bd5a633a86841f6097f486017ae94dfa 100644
--- a/verarbeitung/Processing.py
+++ b/verarbeitung/Processing.py
@@ -23,12 +23,13 @@ from json_demo import output_to_json
 
 
 # adds every publication from input list to graph structure
 # doi_input_list: list of publication dois from user
+def initialize_nodes_list(doi_input_list, search_depth_max, search_height_max, test_var):
+    references_pub_obj_list = []
+    citations_pub_obj_list = []
-# TO-DO: Listenelemente auf Korrektheit überprüfen
-def initialize_nodes_list(doi_input_list, test_var):
 
     for pub_doi in doi_input_list:
-        # checks if its a test and chooses input function accordingly
+        # checks if it's a test and chooses the input function accordingly
         if(test_var):
             pub = input_test_func(pub_doi)
         else:
@@ -46,60 +47,40 @@ def initialize_nodes_list(doi_input_list, test_var):
         else:
             doi_input_list.remove(pub_doi)
 
-# adds inner edges between citations and references to edges
-def complete_inner_edges(test_var):
-    for node in nodes:
-
-        # checks if its a test and chooses input function accordingly
-        if (test_var):
-            pub = input_test_func(node.doi_url)
-        else:
-            pub = input(node.doi_url)
+        # inserts references as publication objects into the list and
+        # inserts first-depth references into nodes/edges if maximum search depth > 0
+        for reference in create_graph_structure_references(pub, 0, search_depth_max, test_var):
+            references_pub_obj_list.append(reference)
+        # inserts citations as publication objects into the list and
+        # inserts first-height citations into nodes if maximum search height > 0
+        for citation in create_graph_structure_citations(pub, 0, search_height_max, test_var):
+            citations_pub_obj_list.append(citation)
+    return(references_pub_obj_list, citations_pub_obj_list)
+
+
+# adds edges between the citation and reference groups
+def complete_inner_edges(test_var):
+    for node in nodes:
         if (node.group == "depth"):
-            for citation in pub.citations:
-                if (citation in nodes and [citation.doi_url, pub.doi_url] not in edges):
-                    edges.append([citation.doi_url, pub.doi_url])
-
+            for citation in node.citations:
+                for cit in nodes:
+                    if (citation.doi_url == cit.doi_url and [citation.doi_url, node.doi_url] not in edges):
+                        edges.append([citation.doi_url, node.doi_url])
         if (node.group == "height"):
-            for reference in pub.references:
-                for node in nodes:
-                    if (reference.doi_url in node.doi_url and [pub.doi_url, reference.doi_url] not in edges):
-                        edges.append([pub.doi_url,reference.doi_url])
-
-
-
-# adds a node for every publication unknown
-# adds edges for citations between publications
-def create_graph_structure_citations(pub, search_height, search_height_max):
-    for citation in pub.citations:
-
-        # checks if publication already exists in nodes
-        not_in_nodes = True
-        for node in nodes:
-            # checks every citation for duplication
-            if (citation.doi_url == node.doi_url):
-                not_in_nodes = False
-                break
-        if (not_in_nodes):
-            if (search_height <= search_height_max):
-                citation.group = "height"
-                nodes.append(citation)
-                edges.append([citation.doi_url,pub.doi_url])
-
-        # adds only an edge (citation already exists)
-        elif [citation.doi_url,pub.doi_url] not in edges:
-            edges.append([citation.doi_url,pub.doi_url])
+            for reference in node.references:
+                for ref in nodes:
+                    if (reference.doi_url == ref.doi_url and [node.doi_url, reference.doi_url] not in edges):
+                        edges.append([node.doi_url,reference.doi_url])
 
 
 # adds a node for every publication unknown
 # adds edges for references between publications
-def create_graph_structure_references(pub, search_depth, search_depth_max):
+def create_graph_structure_references(pub, search_depth, search_depth_max, test_var):
+    references_pub_obj_list = []
     for reference in pub.references:
-
-        # checks if publication already exists in nodes
         not_in_nodes = True
         for node in nodes:
             # checks every reference for duplication
@@ -107,86 +88,94 @@ def create_graph_structure_references(pub, search_depth, search_depth_max):
                 not_in_nodes = False
                 break
         if (not_in_nodes):
-            if (search_depth <= search_depth_max):
-                reference.group = "depth"
-                nodes.append(reference)
-                edges.append([pub.doi_url,reference.doi_url])
+            if (search_depth < search_depth_max):
 
-        # adds only an edge (citation already exists)
-        elif [pub.doi_url,reference.doi_url] not in edges:
-            edges.append([pub.doi_url,reference.doi_url])
-
+                # checks if it's a test and chooses the input function accordingly
+                if (test_var):
+                    reference_pub_obj = input_test_func(reference.doi_url)
+                else:
+                    reference_pub_obj = input(reference.doi_url)
+                reference_pub_obj.group = "depth"
+                nodes.append(reference_pub_obj)
+                edges.append([pub.doi_url,reference_pub_obj.doi_url])
+                references_pub_obj_list.append(reference_pub_obj)
 
-# recursive function to implement height-first-search on citations
-# doi_citations: input list of citet dois
-# search_height: current search_height of height-first-search
-# search_height_max: maximal search_height for dfs
-def process_citations_rec(doi_citations, search_height, search_height_max, test_var):
-    # height of search is increased by 1 with each recursive call
-    search_height += 1
+        # adds an edge only if the reference already exists
+        elif [pub.doi_url,reference.doi_url] not in edges:
+            edges.append([pub.doi_url,reference.doi_url])
+    return references_pub_obj_list
 
-    # create class object for every citation from list
-    for pub_doi in doi_citations:
-        # checks if its a test and chooses input function accordingly
-        if (test_var):
-            pub = input_test_func(pub_doi)
-        else:
-            pub = input(pub_doi)
 
+# recursive function to implement depth-first search on references
+# references_pub_obj_list: input list of references as publication objects
+# search_depth: current search_depth of the depth-first search
+# search_depth_max: maximal search_depth for the dfs
+def process_references_rec(references_pub_obj_list, search_depth, search_depth_max, test_var):
+    # adds next level to nodes/edges
+    for pub in references_pub_obj_list:
+        new_reference_pub_obj_list = create_graph_structure_references(pub, search_depth, search_depth_max, test_var)
 
-        create_graph_structure_citations(pub, search_height, search_height_max)
-        # If the maximum height has not yet been reached, all references from the publication
-        # are written to an array and the function is called again with this array.
-        if (search_height < search_height_max):
-            citations_list = []
-            for citation in pub.citations:
+        # If the maximum depth has not yet been reached, calls the function recursively with increased depth
+        if (search_depth < search_depth_max):
+            process_references_rec(new_reference_pub_obj_list, search_depth+1, search_depth_max, test_var)
 
-                # currently only the references with acs are stored in the URL, because we can't
-                # extract the info from other sources.
-                if ("acs" in citation.doi_url or test_var == True):
-                    citations_list.append(citation.doi_url)
-            # recursive call of function.
-            process_citations_rec(citations_list, search_height, search_height_max, test_var)
+
+# adds a node for every unknown publication
+# adds edges for citations between publications
+def create_graph_structure_citations(pub, search_height, search_height_max, test_var):
+    citations_pub_obj_list = []
+    for citation in pub.citations:
+        not_in_nodes = True
+        for node in nodes:
+            # checks every citation for duplication
+            if (citation.doi_url == node.doi_url):
+                not_in_nodes = False
+                break
+        if (not_in_nodes):
+            if (search_height < search_height_max):
+                # checks if it's a test and chooses the input function accordingly
+                if (test_var):
+                    citation_pub_obj = input_test_func(citation.doi_url)
+                else:
+                    citation_pub_obj = input(citation.doi_url)
 
-# recursive function to implement height-first-search on references
-# doi_references: input list of referenced dois
-# search_depth: current search_depth of height-first-search
-# search_depth_max: maximal search_depth for dfs
-def process_references_rec(doi_references, search_depth, search_depth_max, test_var):
-    # The depth is increased by 1 with each recursive call
-    search_depth += 1
+                citation_pub_obj.group = "height"
+                nodes.append(citation_pub_obj)
+                edges.append([citation_pub_obj.doi_url,pub.doi_url])
+                citations_pub_obj_list.append(citation_pub_obj)
 
-    # create class object for every citation from list
-    for pub_doi in doi_references:
+        # adds only an edge (citation already exists)
+        elif [citation.doi_url,pub.doi_url] not in edges:
+            edges.append([citation.doi_url,pub.doi_url])
+    return citations_pub_obj_list
 
-        #checks if its a test and chooses input function accordingly
-        if (test_var):
-            pub = input_test_func(pub_doi)
-        else:
-            pub = input(pub_doi)
-        create_graph_structure_references(pub, search_depth, search_depth_max)
-        # If the maximum depth has not yet been reached, all references from the publication
-        # are written to an array and the function is called again with this array.
-        if (search_depth < search_depth_max):
-            references_list = []
-            for reference in pub.references:
-                # currently only the references with acs are stored in the URL, because we can't
-                # extract the info from other sources.
-                if ("acs" in reference.doi_url or test_var == True):
-                    references_list.append(reference.doi_url)
 
+# recursive function to implement height-first search on citations
+# citations_pub_obj_list: input list of citations as publication objects
+# search_height: current search_height of the height-first search
+# search_height_max: maximal search_height for the search
+def process_citations_rec(citations_pub_obj_list, search_height, search_height_max, test_var):
+    # adds next level to nodes/edges
+    for pub in citations_pub_obj_list:
+        new_citation_pub_obj_list = create_graph_structure_citations(pub, search_height, search_height_max, test_var)
+
+        # If the maximum height has not yet been reached, calls the function recursively with increased height
+        if (search_height < search_height_max):
+            process_citations_rec(new_citation_pub_obj_list, search_height+1, search_height_max, test_var)
 
-        # recursive call of function.
-        process_references_rec(references_list, search_depth, search_depth_max, test_var)
-
+# main function to call. Needs as input:
+# doi_input_list: input list of dois
+# search_height: max search height to process to
+# search_depth: max search depth to process to
+# test_var: set to True only for unit tests; default is False
 def process_main(doi_input_list, search_height, search_depth, test_var = False):
     # ERROR-Handling doi_array = NULL
     if (len(doi_input_list) == 0):
@@ -206,24 +195,30 @@ def process_main(doi_input_list, search_height, search_depth, test_var = False):
     nodes = []
     edges = []
 
+    # initializes nodes/edges from the input and returns lists of publication objects for citations and references
+    references_obj_list, citations_obj_list = initialize_nodes_list(doi_input_list,search_depth, search_height, test_var)
+
+    # function calls to begin recursive processing up to max depth/height
+    process_citations_rec(citations_obj_list, 1, search_height, test_var)
+    process_references_rec(references_obj_list, 1, search_depth, test_var)
-    initialize_nodes_list(doi_input_list,test_var)
-    process_citations_rec(doi_input_list, 0, search_height, test_var)
-    process_references_rec(doi_input_list, 0, search_depth, test_var)
+
+    # adds edges between the reference group and the citation group of known publications
     complete_inner_edges(test_var)
 
+    # calls a script to save the nodes and edges of the graph in a .json file
     output_to_json(nodes,edges)
 
-    # only for internal testing
-    doi_nodes = []
-    for node in nodes:
-        doi_nodes.append(node.doi_url)
-    return(doi_nodes,edges)
+    # only for unit tests
+    if (test_var == True):
+        doi_nodes_list = []
+        for node in nodes:
+            doi_nodes_list.append(node.doi_url)
+        return(doi_nodes_list, edges)
 
-
+# a function to print the nodes and edges of a graph
 def print_graph(nodes, edges):
     print("Knoten:\n")
     for node in nodes:
@@ -232,25 +227,13 @@
         print(node.title, "\n")
     print("\nKanten:\n")
    for edge in edges:
         print(edge,"\n")
 
-
-# function to test cycles
-def test_cycle():
-    arr = []
-    arr.append('doiz1')
-    #arr.append('doiz2')
-
-    nodes,edges = process_main(arr,1,1,True)
-
-    print(nodes, edges)
-
-    print_graph(nodes, edges)
-# program test, because there is no connection to the input yet.
-def test_print():
-    arr = []
-    #arr.append('https://pubs.acs.org/doi/10.1021/acs.jcim.9b00249')
+# program test, because there is no connection to the UI yet.
+def try_known_publications():
+    doi_list = []
+    doi_list.append('https://pubs.acs.org/doi/10.1021/acs.jcim.9b00249')
     #arr.append('https://pubs.acs.org/doi/10.1021/acs.jcim.9b00249')
-    #arr.append('https://doi.org/10.1021/acs.jmedchem.0c01332')
+    doi_list.append('https://doi.org/10.1021/acs.jmedchem.0c01332')
     #arr.append('https://doi.org/10.1021/acs.jcim.0c00741')
 
     #arr.append('https://doi.org/10.1021/ci700007b')
@@ -259,13 +242,6 @@ def test_print():
     #arr.append[url]
 
 
-    nodes,edges = process_main(arr,2,2,True)
-
-    print_graph(nodes, edges)
-
-#test_print()
-#test_cycle()
-#print(process_main(['doiz1'],1,1,True))
-#print(process_main(['doi1'],0,0,True))
+    nodes,edges = process_main(doi_list,2,2)
-
\ No newline at end of file
+    print_graph(nodes, edges)
\ No newline at end of file
diff --git a/verarbeitung/Processing_pub_objs_only.py b/verarbeitung/Processing_pub_objs_only.py
deleted file mode 100644
index a6c1ed30aaa2e775c87979921895df9c88867d4d..0000000000000000000000000000000000000000
--- a/verarbeitung/Processing_pub_objs_only.py
+++ /dev/null
@@ -1,255 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Functions to generate a graph representing citations between multiple ACS/Nature journals
-
-"""
-
-__authors__ = "Donna Löding, Alina Molkentin, Xinyi Tang, Judith Große, Malte Schokolowski"
-__email__ = "cis-project2021@zbh.uni-hamburg.de"
-__status__ = "Production"
-#__copyright__ = ""
-#__credits__ = ["", "", "", ""]
-#__license__ = ""
-#__version__ = ""
-#__maintainer__ = ""
-
-from bs4 import BeautifulSoup as bs
-import requests as req
-import sys
-from pathlib import Path
-from input_fj import input
-from input_test import input_test_func
-from json_demo import output_to_json
-
-# adds every publication from input list to graph structure
-# doi_input_list: list of publication dois from user
-def initialize_nodes_list(doi_input_list, search_depth_max, search_height_max, test_var):
-    references_pub_obj_list = []
-    citations_pub_obj_list = []
-
-    for pub_doi in doi_input_list:
-
-        #checks if its a test and chooses input function accordingly
-        if(test_var):
-            pub = input_test_func(pub_doi)
-        else:
-            pub = input(pub_doi)
-
-        # checks if publication already exists in nodes
-        not_in_nodes = True
-        for node in nodes:      # checks if a pub is already in nodes
-            if (pub.doi_url == node.doi_url):
-                not_in_nodes = False
-                break
-        if (not_in_nodes):
-            nodes.append(pub)
-            pub.group = "input"
-        else:
-            doi_input_list.remove(pub_doi)
-
-        # inserts references as publication objects into list and
-        # inserts first depth references into nodes/edges if maximum search depth > 0
-        for reference in create_graph_structure_references(pub, 0, search_depth_max, test_var):
-            references_pub_obj_list.append(reference)
-
-        # inserts citations as publication objects into list and
-        # inserts first height citations into nodes if maximum search height > 0
-        for citation in create_graph_structure_citations(pub, 0, search_height_max, test_var):
-            citations_pub_obj_list.append(citation)
-
-    return(references_pub_obj_list, citations_pub_obj_list)
-
-
-# adds edges between citation and reference group
-def complete_inner_edges(test_var):
-    for node in nodes:
-        if (node.group == "depth"):
-            for citation in node.citations:
-                if (citation in nodes and [citation.doi_url, node.doi_url] not in edges):
-                    edges.append([citation.doi_url, node.doi_url])
-        if (node.group == "height"):
-            for reference in node.references:
-                if (reference in nodes and [node.doi_url, reference.doi_url] not in edges):
-                    edges.append([node.doi_url,reference.doi_url])
-
-
-
-
-# adds a node for every publication unknown
-# adds edges for references between publications
-def create_graph_structure_references(pub, search_depth, search_depth_max, test_var):
-    references_pub_obj_list = []
-    for reference in pub.references:
-        not_in_nodes = True
-        for node in nodes:
-            # checks every reference for duplication
-            if (reference.doi_url == node.doi_url):
-                not_in_nodes = False
-                break
-        if (not_in_nodes):
-            if (search_depth < search_depth_max):
-
-                #checks if its a test and chooses input function accordingly
-                if (test_var):
-                    reference_pub_obj = input_test_func(reference.doi_url)
-                else:
-                    reference_pub_obj = input(reference.doi_url)
-
-                reference_pub_obj.group = "depth"
-                nodes.append(reference_pub_obj)
-                edges.append([pub.doi_url,reference_pub_obj.doi_url])
-                references_pub_obj_list.append(reference_pub_obj)
-
-        # adds edge only if citation already exists
-        elif [pub.doi_url,reference.doi_url] not in edges:
-            edges.append([pub.doi_url,reference.doi_url])
-    return references_pub_obj_list
-
-
-# recursive function to implement height-first-search on references
-# references_pub_obj_list: input list of references as publication objects
-# search_depth: current search_depth of height-first-search
-# search_depth_max: maximal search_depth for dfs
-def process_references_rec(references_pub_obj_list, search_depth, search_depth_max, test_var):
-    # adds next level to nodes/edges
-    for pub in references_pub_obj_list:
-        new_reference_pub_obj_list = create_graph_structure_references(pub, search_depth, search_depth_max, test_var)
-
-        # If the maximum height has not yet been reached, calls function recursivly with increased height
-        if (search_depth < search_depth_max):
-            process_references_rec(new_reference_pub_obj_list, search_depth+1, search_depth_max, test_var)
-
-
-
-
-# adds a node for every publication unknown
-# adds edges for citations between publications
-def create_graph_structure_citations(pub, search_height, search_height_max, test_var):
-    citations_pub_obj_list = []
-    for citation in pub.citations:
-        not_in_nodes = True
-        for node in nodes:
-            # checks every citation for duplication
-            if (citation.doi_url == node.doi_url):
-                not_in_nodes = False
-                break
-        if (not_in_nodes):
-            if (search_height < search_height_max):
-
-                #checks if its a test and chooses input function accordingly
-                if (test_var):
-                    citation_pub_obj = input_test_func(citation.doi_url)
-                else:
-                    citation_pub_obj = input(citation.doi_url)
-
-                citation_pub_obj.group = "height"
-                nodes.append(citation_pub_obj)
-                edges.append([citation_pub_obj.doi_url,pub.doi_url])
-                citations_pub_obj_list.append(citation_pub_obj)
-
-        # adds only edge if citation already exists
-        elif [citation.doi_url,pub.doi_url] not in edges:
-            edges.append([citation.doi_url,pub.doi_url])
-    return citations_pub_obj_list
-
-
-
-# recursive function to implement height-first-search on citations
-# citations_pub_obj_list: input list of citations as publication objects
-# search_height: current search_height of height-first-search
-# search_height_max: maximal search_height for dfs
-def process_citations_rec(citations_pub_obj_list, search_height, search_height_max, test_var):
-    # adds next level to nodes/edges
-    for pub in citations_pub_obj_list:
-        new_citation_pub_obj_list = create_graph_structure_citations(pub, search_height, search_height_max, test_var)
-
-        # If the maximum height has not yet been reached, calls function recursivly with increased height
-        if (search_height < search_height_max):
-            process_citations_rec(new_citation_pub_obj_list, search_height+1, search_height_max, test_var)
-
-
-
-
-def process_main(doi_input_list, search_height, search_depth, test_var = False):
-    # ERROR-Handling doi_array = NULL
-    if (len(doi_input_list) == 0):
-        print("Error, no input data")
-
-    # ERROR- if a negative number is entered for height
-    if (search_height < 0):
-        print("Error, search_height of search must be positive")
-
-    # ERROR- if a negative number is entered for depth
-    if (search_depth < 0):
-        print("Error, search_depth of search must be positive")
-
-    # create empty array for the nodes
-    # create empty array for the edges
-    global nodes, edges
-    nodes = []
-    edges = []
-
-    # initializes nodes/edges from input and gets a list with publication objects for citations and references returned
-    references_obj_list, citations_obj_list = initialize_nodes_list(doi_input_list,search_depth, search_height, test_var)
-    process_citations_rec(citations_obj_list, 1, search_height, test_var)
-    process_references_rec(references_obj_list, 1, search_depth, test_var)
-    complete_inner_edges(test_var)
-
-    output_to_json(nodes,edges)
-
-    # only for internal testing
-    doi_nodes = []
-    for node in nodes:
-        doi_nodes.append(node.doi_url)
-    return(doi_nodes,edges)
-
-
-
-
-# a function to print nodes and edges from a graph
-def print_graph(nodes, edges):
-    print("Knoten:\n")
-    for node in nodes:
-        print(node.title, "\n")
-    print("\nKanten:\n")
-    for edge in edges:
-        print(edge,"\n")
-
-
-# function to test cycles
-def test_cycle():
-    arr = []
-    arr.append('doiz1')
-    #arr.append('doiz2')
-
-    nodes,edges = process_main(arr,1,1,True)
-
-    print(nodes, edges)
-
-    print_graph(nodes, edges)
-
-# program test, because there is no connection to the input yet.
-def test_print():
-    arr = []
-    #arr.append('https://pubs.acs.org/doi/10.1021/acs.jcim.9b00249')
-    #arr.append('https://pubs.acs.org/doi/10.1021/acs.jcim.9b00249')
-    #arr.append('https://doi.org/10.1021/acs.jmedchem.0c01332')
-    #arr.append('https://doi.org/10.1021/acs.jcim.0c00741')
-
-    #arr.append('https://doi.org/10.1021/ci700007b')
-    #arr.append('https://doi.org/10.1021/acs.jcim.5b00292')
-    #url = sys.argv[1]
-    #arr.append[url]
-
-
-    nodes,edges = process_main(arr,2,2,True)
-
-    print_graph(nodes, edges)
-
-#test_print()
-#test_cycle()
-#print(process_main(['doiz1'],1,1,True))
-#print(process_main(['doi1'],0,0,True))
-
-
\ No newline at end of file
diff --git a/verarbeitung/Processing_unittest.py b/verarbeitung/Processing_unittest.py
index 772d57204ce3374211d1d1fd3d08d279f085aac3..fd11131fac147f55ff2a948a32831a76e926964b 100644
--- a/verarbeitung/Processing_unittest.py
+++ b/verarbeitung/Processing_unittest.py
@@ -1,5 +1,5 @@
 import unittest
-from Processing import process_main
+from Processing_pub_objs_only import process_main
 
 class ProcessingTest(unittest.TestCase):
     def testCycle(self):
diff --git a/verarbeitung/__pycache__/Processing.cpython-39.pyc b/verarbeitung/__pycache__/Processing.cpython-39.pyc
index e08ca682bcdfcff17580d1e2c0923b6aac9ce00d..54c63251bbf3affbdd176d3d55f4956c2fc08406 100644
Binary files a/verarbeitung/__pycache__/Processing.cpython-39.pyc and b/verarbeitung/__pycache__/Processing.cpython-39.pyc differ
diff --git a/verarbeitung/__pycache__/Processing_pub_objs_only.cpython-39.pyc b/verarbeitung/__pycache__/Processing_pub_objs_only.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9ce1023e6ea54e1b04b37ad5a1fd08115d5f52a4
Binary files /dev/null and b/verarbeitung/__pycache__/Processing_pub_objs_only.cpython-39.pyc differ