# -*- coding: utf-8 -*-
"""
Functions to generate a graph representing citations between multiple ACS/Nature journals

The graph is kept in two module-level lists:

* ``nodes`` -- publication objects; each one carries a ``group`` attribute
  set to "input", "depth" (reached via references) or "height" (reached
  via citations).
* ``edges`` -- two-element lists ``[citing_doi_url, cited_doi_url]``.

"""

__authors__ = "Donna Löding, Alina Molkentin, Xinyi Tang, Judith Große, Malte Schokolowski"
__email__ = "cis-project2021@zbh.uni-hamburg.de"
__status__ = "Production"
#__copyright__ = ""
#__credits__ = ["", "", "", ""]
#__license__ = ""
#__version__ = ""
#__maintainer__ = ""

from bs4 import BeautifulSoup as bs
import requests as req
import sys
from pathlib import Path
from input_fj import input
from input_test import input_test_func
from json_demo import output_to_json

# Graph state shared by all functions in this module. process_main() rebinds
# both names to fresh lists on every run; they are initialised here so the
# helper functions do not fail with a NameError when called on their own.
nodes = []
edges = []


def _fetch_publication(doi_url, test_var):
    """Return the publication object for doi_url.

    Uses input_test_func when test_var is truthy, otherwise the input
    function imported from input_fj (which shadows the builtin of that name).
    """
    if test_var:
        return input_test_func(doi_url)
    return input(doi_url)


def _is_known_node(doi_url):
    """Return True if a publication with this doi_url is already in nodes."""
    return any(node.doi_url == doi_url for node in nodes)


def initialize_nodes_list(doi_input_list, search_depth_max, search_height_max, test_var):
    """Add every publication from the input list to the graph.

    Args:
        doi_input_list: list of publication DOIs from the user. DOIs whose
            publication is already in nodes are removed from this list
            in place.
        search_depth_max: maximum depth for following references.
        search_height_max: maximum height for following citations.
        test_var: truthy to fetch publications with the test input function.

    Returns:
        Tuple (references, citations) with the publication objects that were
        newly added to nodes as first-level references / citations.
    """
    references_pub_obj_list = []
    citations_pub_obj_list = []

    # Iterate over a snapshot: duplicates are removed from doi_input_list
    # below, and removing from a list while iterating over it makes the loop
    # skip the entry that follows each removed one.
    for pub_doi in list(doi_input_list):
        pub = _fetch_publication(pub_doi, test_var)

        if _is_known_node(pub.doi_url):
            doi_input_list.remove(pub_doi)
        else:
            nodes.append(pub)
            pub.group = "input"

        # First-level references/citations become nodes only if the
        # respective maximum is > 0; otherwise only edges to already known
        # nodes are added.
        references_pub_obj_list.extend(
            create_graph_structure_references(pub, 0, search_depth_max, test_var)
        )
        citations_pub_obj_list.extend(
            create_graph_structure_citations(pub, 0, search_height_max, test_var)
        )

    return (references_pub_obj_list, citations_pub_obj_list)


def complete_inner_edges(test_var):
    """Add the missing edges between the citation and the reference group.

    For every "depth" node, edges from its citations are added, and for every
    "height" node, edges to its references are added, provided the other end
    is in nodes and the edge is not present yet.

    Args:
        test_var: unused; kept for a uniform signature.
    """
    for node in nodes:
        if node.group == "depth":
            for citation in node.citations:
                # NOTE(review): "citation in nodes" compares publication
                # objects, not DOIs, so it depends on how the publication
                # class defines equality -- confirm against that class.
                if citation in nodes:
                    edge = [citation.doi_url, node.doi_url]
                    if edge not in edges:
                        edges.append(edge)
        elif node.group == "height":
            for reference in node.references:
                if reference in nodes:
                    edge = [node.doi_url, reference.doi_url]
                    if edge not in edges:
                        edges.append(edge)


def create_graph_structure_references(pub, search_depth, search_depth_max, test_var):
    """Add nodes and edges for the references of one publication.

    A reference that is already a node only gets an edge pub -> reference
    (if missing). An unknown reference is fetched, added as a "depth" node
    and connected, but only while search_depth < search_depth_max.

    Args:
        pub: publication object whose references are processed.
        search_depth: current search depth.
        search_depth_max: maximum search depth.
        test_var: truthy to fetch publications with the test input function.

    Returns:
        List of the publication objects newly added to nodes.
    """
    references_pub_obj_list = []
    for reference in pub.references:
        if _is_known_node(reference.doi_url):
            edge = [pub.doi_url, reference.doi_url]
            if edge not in edges:
                edges.append(edge)
        elif search_depth < search_depth_max:
            reference_pub_obj = _fetch_publication(reference.doi_url, test_var)
            reference_pub_obj.group = "depth"
            nodes.append(reference_pub_obj)
            edges.append([pub.doi_url, reference_pub_obj.doi_url])
            references_pub_obj_list.append(reference_pub_obj)
    return references_pub_obj_list


def process_references_rec(references_pub_obj_list, search_depth, search_depth_max, test_var):
    """Recursively follow references (depth-first) and extend the graph.

    Args:
        references_pub_obj_list: publication objects whose references are
            processed on this level.
        search_depth: current search depth.
        search_depth_max: maximum search depth.
        test_var: truthy to fetch publications with the test input function.
    """
    for pub in references_pub_obj_list:
        # adds the next level to nodes/edges
        new_reference_pub_obj_list = create_graph_structure_references(
            pub, search_depth, search_depth_max, test_var
        )

        # Recurse with increased depth as long as the maximum is not reached.
        if search_depth < search_depth_max:
            process_references_rec(
                new_reference_pub_obj_list, search_depth + 1, search_depth_max, test_var
            )


def create_graph_structure_citations(pub, search_height, search_height_max, test_var):
    """Add nodes and edges for the citations of one publication.

    A citation that is already a node only gets an edge citation -> pub
    (if missing). An unknown citation is fetched, added as a "height" node
    and connected, but only while search_height < search_height_max.

    Args:
        pub: publication object whose citations are processed.
        search_height: current search height.
        search_height_max: maximum search height.
        test_var: truthy to fetch publications with the test input function.

    Returns:
        List of the publication objects newly added to nodes.
    """
    citations_pub_obj_list = []
    for citation in pub.citations:
        if _is_known_node(citation.doi_url):
            edge = [citation.doi_url, pub.doi_url]
            if edge not in edges:
                edges.append(edge)
        elif search_height < search_height_max:
            citation_pub_obj = _fetch_publication(citation.doi_url, test_var)
            citation_pub_obj.group = "height"
            nodes.append(citation_pub_obj)
            edges.append([citation_pub_obj.doi_url, pub.doi_url])
            citations_pub_obj_list.append(citation_pub_obj)
    return citations_pub_obj_list


def process_citations_rec(citations_pub_obj_list, search_height, search_height_max, test_var):
    """Recursively follow citations (depth-first) and extend the graph.

    Args:
        citations_pub_obj_list: publication objects whose citations are
            processed on this level.
        search_height: current search height.
        search_height_max: maximum search height.
        test_var: truthy to fetch publications with the test input function.
    """
    for pub in citations_pub_obj_list:
        # adds the next level to nodes/edges
        new_citation_pub_obj_list = create_graph_structure_citations(
            pub, search_height, search_height_max, test_var
        )

        # Recurse with increased height as long as the maximum is not reached.
        if search_height < search_height_max:
            process_citations_rec(
                new_citation_pub_obj_list, search_height + 1, search_height_max, test_var
            )


def process_main(doi_input_list, search_height, search_depth, test_var = False):
    """Build the citation graph for the given DOIs and pass it to output_to_json.

    Invalid arguments are only reported with a printed message; processing
    continues afterwards so that callers always receive the result tuple.

    Args:
        doi_input_list: list of publication DOIs from the user; duplicates
            are removed from it in place.
        search_height: maximum height for following citations.
        search_depth: maximum depth for following references.
        test_var: truthy to fetch publications with the test input function.

    Returns:
        Tuple (doi_nodes, edges): the doi_url of every node and the list of
        edges. Intended for internal testing only.
    """
    # ERROR handling: no input DOIs
    if not doi_input_list:
        print("Error, no input data")

    # ERROR: a negative number was entered for height
    if search_height < 0:
        print("Error, search_height of search must be positive")

    # ERROR: a negative number was entered for depth
    if search_depth < 0:
        print("Error, search_depth of search must be positive")

    # start every run with an empty graph
    global nodes, edges
    nodes = []
    edges = []

    # Level 0 is handled by initialize_nodes_list, so the recursion starts
    # at level 1 with the publications found there.
    references_obj_list, citations_obj_list = initialize_nodes_list(
        doi_input_list, search_depth, search_height, test_var
    )
    process_citations_rec(citations_obj_list, 1, search_height, test_var)
    process_references_rec(references_obj_list, 1, search_depth, test_var)
    complete_inner_edges(test_var)

    output_to_json(nodes, edges)

    # only for internal testing
    doi_nodes = [node.doi_url for node in nodes]
    return (doi_nodes, edges)


def print_graph(nodes, edges):
    """Print the nodes and edges of a graph.

    Args:
        nodes: publication objects (their title is printed) or plain DOI
            strings as returned by process_main (printed as they are).
        edges: list of edges; each one is printed as it is.
    """
    print("Knoten:\n")
    for node in nodes:
        # On a str, ".title" is the bound str.title method, not a
        # publication title, so strings are printed directly.
        label = node if isinstance(node, str) else node.title
        print(label, "\n")
    print("\nKanten:\n")
    for edge in edges:
        print(edge, "\n")


def test_cycle():
    """Manual test for cycles in the citation graph (uses the test input)."""
    arr = []
    arr.append('doiz1')
    #arr.append('doiz2')

    nodes, edges = process_main(arr, 1, 1, True)

    print(nodes, edges)

    print_graph(nodes, edges)


def test_print():
    """Manual program test, because there is no connection to the input yet."""
    arr = []
    #arr.append('https://pubs.acs.org/doi/10.1021/acs.jcim.9b00249')
    #arr.append('https://pubs.acs.org/doi/10.1021/acs.jcim.9b00249')
    #arr.append('https://doi.org/10.1021/acs.jmedchem.0c01332')
    #arr.append('https://doi.org/10.1021/acs.jcim.0c00741')

    #arr.append('https://doi.org/10.1021/ci700007b')
    #arr.append('https://doi.org/10.1021/acs.jcim.5b00292')
    #url = sys.argv[1]
    #arr.append[url]

    nodes, edges = process_main(arr, 2, 2, True)

    print_graph(nodes, edges)

#test_print()
#test_cycle()
#print(process_main(['doiz1'],1,1,True))
#print(process_main(['doi1'],0,0,True))